# OptimizingDelivery Analysis

## Import libraries

In [57]:
import os 
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

from scipy.stats import randint, uniform


## Import Data

In [8]:
%store -r optimizingdelivery_data_dir

In [12]:
# Define dataframes and dataframe names
dataframe_names = ['customers', 'dates_2022', 'products', 'target_orders', 'order_lines', 'orders_aggregate']

In [13]:
# Reset dataframes
def reset_dataframes():

    # Read parquet and overwrite local instance, effectively resetting the dataframes
    for name in dataframe_names:

        file_path = os.path.join(optimizingdelivery_data_dir, f"{name}.parquet")

        globals()[name] = pd.read_parquet(file_path)

In [14]:
reset_dataframes()

In [15]:
print(customers.shape)
print(dates_2022.shape)
print(products.shape)
print(target_orders.shape)
print(order_lines.shape)
print(orders_aggregate.shape)

(35, 3)
(365, 4)
(18, 3)
(35, 4)
(57096, 11)
(31729, 6)


## Analysis

In [16]:
order_lines.columns

Index(['order_id', 'order_placement_date', 'customer_id', 'product_id',
       'order_qty', 'agreed_delivery_date', 'actual_delivery_date',
       'delivery_qty', 'in_full', 'on_time', 'on_time_in_full'],
      dtype='object')

In [17]:
print('order_lines')
print('ot_perc: ' + str((order_lines['on_time'].sum() / len(order_lines)) * 100))
print('if_perc: ' + str((order_lines['in_full'].sum() / len(order_lines)) * 100))
print('otif_perc: ' + str((order_lines['on_time_in_full'].sum() / len(order_lines)) * 100))


order_lines
ot_perc: 71.11706599411517
if_perc: 65.96083788706738
otif_perc: 47.95432254448648


In [18]:
print('orders_aggregate')
print('ot_perc: ' + str((orders_aggregate['on_time'].sum() / len(orders_aggregate)) * 100))
print('if_perc: ' + str((orders_aggregate['in_full'].sum() / len(orders_aggregate)) * 100))
print('otif_perc: ' + str((orders_aggregate['otif'].sum() / len(orders_aggregate)) * 100))


orders_aggregate
ot_perc: 59.03117022282455
if_perc: 52.78136720350468
otif_perc: 29.020769642913425


In [19]:
order_lines.sample(5)

Unnamed: 0,order_id,order_placement_date,customer_id,product_id,order_qty,agreed_delivery_date,actual_delivery_date,delivery_qty,in_full,on_time,on_time_in_full
34191,FJUN620621603,2022-06-17,789621,25891302,96,2022-06-20,2022-06-20,91,0,1,0
37042,FJUN629422503,2022-06-26,789422,25891502,205,2022-06-29,2022-07-01,185,0,0,0
26883,FMY527522302,2022-05-25,789522,25891101,458,2022-05-27,2022-05-29,458,1,0,0
43094,FJUL718221301,2022-07-16,789221,25891301,44,2022-07-18,2022-07-17,44,1,1,1
39848,FJUL76220602,2022-07-05,789220,25891403,408,2022-07-06,2022-07-06,408,1,1,1


In [20]:
# Calculate the mean percentages
customer_split = order_lines.groupby('customer_id')[['in_full', 'on_time', 'on_time_in_full']].mean() * 100

# Reset index to turn 'customer_id' into a regular column
customer_split = customer_split.reset_index().rename(columns={'in_full': 'if_pct',
                                                             'on_time': 'ot_pct', 
                                                             'on_time_in_full': 'otif_pct'})

customer_split = customer_split.round(2)

In [21]:
customer_split = pd.merge(customer_split, target_orders, how='left', on='customer_id')

In [22]:
customer_split.columns

Index(['customer_id', 'if_pct', 'ot_pct', 'otif_pct', 'ontime_target%',
       'infull_target%', 'otif_target%'],
      dtype='object')

In [23]:
customer_split.rename(columns={ 'infull_target%': 'if_target_pct',
                                'otif_target%': 'of_target_pct', 
                                'ontime_target%': 'otif_target_pct'}, inplace=True)

In [24]:
customer_split.columns

Index(['customer_id', 'if_pct', 'ot_pct', 'otif_pct', 'otif_target_pct',
       'if_target_pct', 'of_target_pct'],
      dtype='object')

In [25]:
customer_split = customer_split[['customer_id', 'if_pct', 'if_target_pct', 
                                 'ot_pct', 'of_target_pct', 
                                 'otif_pct', 'otif_target_pct']]

## Machine Learning Model

##### Create Predictive Models:
  
Building predictive models to determine whether an order line meets targets like 'in_full', 'on_time', or 'otif' is a fundamental step. You can use various machine learning algorithms like logistic regression, random forests, or gradient boosting, depending on the complexity of your data and the desired accuracy of predictions.
Ensure you have labeled data where each order line is marked as 'true' or 'false' for meeting the targets. Also, pay attention to feature engineering to extract relevant information from your dataset.
  
##### Identify Associated Variables:
  
Analyzing the importance of variables associated with failure for 'in_full', 'on_time', or 'otif' is crucial for understanding the root causes of delivery failures. You can use techniques like feature importance analysis from tree-based models or permutation importance to identify the most influential features.
Understanding these variables can provide insights into areas of improvement in your production process. For example, if a specific product category or supplier consistently results in delivery failures, you can focus on improving those aspects.
Predict Targets and Evaluate Model Performance:
  
After training your predictive models, assess their performance using appropriate evaluation metrics such as accuracy, precision, recall, F1-score, or area under the ROC curve (AUC). Ensure you use proper validation techniques like cross-validation to estimate the model's generalization performance.
It's essential to have reliable models that accurately predict whether an order line meets the specified targets. These models will serve as the basis for making informed decisions and implementing improvements in your production process.
  
##### Extract Insights for Improvement:
  
Once you have reliable predictive models, leverage the insights gained from them to suggest improvements in your production process. For example, if certain factors consistently lead to delivery failures, you can take corrective actions such as optimizing inventory management, improving supplier relationships, enhancing transportation logistics, or refining order processing workflows.
Continuously monitor the performance of your production process and iteratively refine your strategies based on the insights provided by the predictive models.

### Prepare model data

#### Incorporate as much relevant data into the training model

In [26]:
model_data = order_lines.copy()

##### Merge dates

In [27]:
model_data = pd.merge(model_data, dates_2022, how='left', left_on='order_placement_date', right_on='date')

model_data['order_placement_month'] = model_data['month'].astype('int64')
model_data['order_placement_day'] = model_data['order_placement_date'].dt.day.astype('int64')
model_data['order_placement_week_no'] = model_data['week_number'].astype('int64')

model_data = model_data[['order_id', 'order_placement_date', 'order_placement_month', 'order_placement_day', 'order_placement_week_no', 
                        'customer_id', 'product_id','order_qty', 'agreed_delivery_date', 'actual_delivery_date',
                        'delivery_qty', 'in_full', 'on_time', 'on_time_in_full']].copy()

In [28]:
model_data = pd.merge(model_data, dates_2022, how='left', left_on='agreed_delivery_date', right_on='date')

model_data['agreed_delivery_month'] = model_data['month'].astype('int64')
model_data['agreed_delivery_day'] = model_data['agreed_delivery_date'].dt.day.astype('int64')
model_data['agreed_delivery_week_no'] = model_data['week_number'].astype('int64')

model_data = model_data[['order_id', 'order_placement_date', 'order_placement_month', 'order_placement_day', 'order_placement_week_no', 
                        'customer_id', 'product_id','order_qty', 
                        'agreed_delivery_date', 'agreed_delivery_month', 'agreed_delivery_day', 'agreed_delivery_week_no', 
                        'actual_delivery_date',
                        'delivery_qty', 'in_full', 'on_time', 'on_time_in_full',]].copy()

In [29]:
model_data = pd.merge(model_data, dates_2022, how='left', left_on='actual_delivery_date', right_on='date')

model_data['actual_delivery_month'] = model_data['month'].astype('int64')
model_data['actual_delivery_day'] = model_data['actual_delivery_date'].dt.day.astype('int64')
model_data['actual_delivery_week_no'] = model_data['week_number'].astype('int64')

model_data = model_data[['order_id', 'order_placement_date', 'order_placement_month', 'order_placement_day', 'order_placement_week_no', 
                        'customer_id', 'product_id','order_qty', 
                        'agreed_delivery_date', 'agreed_delivery_month', 'agreed_delivery_day', 'agreed_delivery_week_no', 
                        'actual_delivery_date', 'actual_delivery_month', 'actual_delivery_day', 'actual_delivery_week_no',
                        'delivery_qty', 'in_full', 'on_time', 'on_time_in_full',]].copy()

##### Merge customer

In [30]:
model_data = pd.merge(model_data, customers, how='left', on='customer_id')

##### Merge customer

In [31]:
model_data = pd.merge(model_data, products, how='left', on='product_id')

#### Refine model data datatypes

In [32]:
model_data.dtypes

order_id                           object
order_placement_date       datetime64[ns]
order_placement_month               int64
order_placement_day                 int64
order_placement_week_no             int64
customer_id                         int64
product_id                          int64
order_qty                           int64
agreed_delivery_date       datetime64[ns]
agreed_delivery_month               int64
agreed_delivery_day                 int64
agreed_delivery_week_no             int64
actual_delivery_date       datetime64[ns]
actual_delivery_month               int64
actual_delivery_day                 int64
actual_delivery_week_no             int64
delivery_qty                        int64
in_full                             int64
on_time                             int64
on_time_in_full                     int64
customer_name                      object
city                               object
product_name                       object
category                          

In [33]:
# Change object datatyes into categorical codes
model_data['order_code'] = model_data['order_id'].astype('category').cat.codes
model_data['customer_name_code'] = model_data['customer_name'].astype('category').cat.codes
model_data['city_code'] = model_data['city'].astype('category').cat.codes
model_data['product_name_code'] = model_data['product_name'].astype('category').cat.codes
model_data['category_code'] = model_data['category'].astype('category').cat.codes

In [34]:
# Refine columns
model_data = model_data[[   'order_id',
                            'order_code',
                            'order_placement_date',
                            'order_placement_month',
                            'order_placement_day',
                            'order_placement_week_no',
                            'customer_id',
                            'customer_name',
                            'customer_name_code',
                            'city',
                            'city_code',
                            'product_id',
                            'product_name',
                            'product_name_code',
                            'category',
                            'category_code',
                            'order_qty',
                            'agreed_delivery_date',
                            'agreed_delivery_month',
                            'agreed_delivery_day',
                            'agreed_delivery_week_no',
                            'actual_delivery_date',
                            'actual_delivery_month',
                            'actual_delivery_day',
                            'actual_delivery_week_no',
                            'delivery_qty',
                            'in_full',
                            'on_time',
                            'on_time_in_full']]

# Sort model data by order placement date and reset the index
model_data.sort_values('order_placement_date', inplace=True)
model_data.reset_index(drop=True, inplace=True)

In [35]:
model_data.shape

(57096, 29)

In [36]:
model_data.isna().sum().sum()

0

### Define Functions

In [None]:
# Cross validating accuracy scores

# Define scorer
accuracy_scorer = make_scorer(accuracy_score)

# Define function
def cv_accuracy_score(model, model_data, features, targets, accuracy_scorer,):

    # Cross validation with 10 folds
        
    cv_scores = cross_val_score(model, model_data[features], model_data[targets], cv=10, scoring=accuracy_scorer)

    # Print scores 
    print(list("Cross Validation Accuracy Scores:", cv_scores))

    # Print mean accuracy
    print("Mean Accuracy:", cv_scores.mean())


### Multi-Output Random Forest Classifier

#### Model 1
  
First model, Random split model is more accurate than sequential split model  
  
Random:  
Mean Cross-validation accuracy: 0.5855006584300525  
  
Sequential:  
Mean Cross-validation accuracy: 0.365746872700515

##### Random Model

In [44]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [47]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.54045534 0.57758319 0.56987741 0.65043783 0.56497373 0.60700525
 0.64938704 0.56987741 0.57968476 0.64238179 0.63257443 0.60105079
 0.60980736 0.64763573 0.5821366  0.59754816 0.65557113 0.58058865
 0.58023826 0.27119832]
Mean Cross-validation score: 0.5855006584300525


##### Sequential Model

In [None]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Split the data into features (X) and targets (y)
X = model_data[features]
y = model_data[targets]

# Perform train-test split while preserving temporal order
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import make_scorer, accuracy_score

X = model_data[features]
y = model_data[targets]

# Define the sequential cross-validation splitter
tscv = TimeSeriesSplit(n_splits=20)

# Define a custom scorer for multi-output classification
scorer = make_scorer(accuracy_score)

# Perform cross-validation
cv_scores = cross_val_score(multi_output, X, y, cv=tscv, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())

#### Model 2
  
Random Search to tune hyperparameters
  
Mean Cross-validation score: 0.5975163932514909  
Paramters tuned:  
    max_depth,   
    max_features,   
    min_samples_leaf,  
    min_samples_split=,    
    n_estimators

##### Random Search 1
  
param_dist = {  
    'estimator__n_estimators': randint(10, 200),  
    'estimator__max_depth': randint(2, 20),  
    'estimator__min_samples_split': randint(2, 20),  
    'estimator__min_samples_leaf': randint(1, 20),  
    'estimator__max_features': ['auto', 'sqrt', 'log2']   }

Best Parameters: RandomForestClassifier(  
    max_depth=15,   
    max_features='log2',   
    min_samples_leaf=2,  
    min_samples_split=15,    
    n_estimators=192)

In [61]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from scipy.stats import randint

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Define paramater ranges
param_dist = {
    'estimator__n_estimators': randint(10, 200),
    'estimator__max_depth': randint(2, 20),
    'estimator__min_samples_split': randint(2, 20),
    'estimator__min_samples_leaf': randint(1, 20),
    'estimator__max_features': ['auto', 'sqrt', 'log2']
}

# Fit model
random_search = RandomizedSearchCV(estimator=multi_output, param_distributions=param_dist, n_iter=100, cv=5, random_state=9, n_jobs=-1, scoring='accuracy')

random_search.fit(X_train, y_train)

# Best estimator
best_rf_estimator = random_search.best_estimator_.estimators_[0]



215 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
131 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 273, in _wrap_func_call
    return func()
           ^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  F

In [67]:
print("Best Parameters:", best_rf_estimator)

Best Parameters: RandomForestClassifier(max_depth=15, max_features='log2', min_samples_leaf=2,
                       min_samples_split=15, n_estimators=192, random_state=9)


##### Random Search 2
  
param_dist = {  
    'estimator__n_estimators': randint(150, 250),  
    'estimator__max_depth': randint(10, 20),  
    'estimator__min_samples_split': randint(10, 20),  
    'estimator__min_samples_leaf': randint(1, 5),  
    'estimator__max_features': ['auto', 'sqrt', 'log2']  
}
  
Best Parameters: RandomForestClassifier(  
    max_depth=19,  
    max_features='log2',   
    min_samples_leaf=2,  
    min_samples_split=18,  
    n_estimators=189)

In [68]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from scipy.stats import randint

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Define paramater ranges
param_dist = {
    'estimator__n_estimators': randint(150, 250),
    'estimator__max_depth': randint(10, 20),
    'estimator__min_samples_split': randint(10, 20),
    'estimator__min_samples_leaf': randint(1, 5),
    'estimator__max_features': ['auto', 'sqrt', 'log2']
}

# Fit model
random_search = RandomizedSearchCV(estimator=multi_output, param_distributions=param_dist, n_iter=100, cv=5, random_state=9, n_jobs=-1, scoring='accuracy')

random_search.fit(X_train, y_train)

# Best estimator
best_rf_estimator = random_search.best_estimator_.estimators_[0]

175 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
82 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 273, in _wrap_func_call
    return func()
           ^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  Fi

In [69]:
print("Best Parameters:", best_rf_estimator)

Best Parameters: RandomForestClassifier(max_depth=19, max_features='log2', min_samples_split=18,
                       n_estimators=189, random_state=9)


##### Random Search 3
  
param_dist = {  
    'estimator__n_estimators': randint(170, 200),  
    'estimator__max_depth': randint(15, 25),  
    'estimator__min_samples_split': randint(15, 20),  
    'estimator__min_samples_leaf': randint(1, 5),  
    'estimator__max_features': ['auto', 'sqrt', 'log2']  
}  
    
Best Parameters: RandomForestClassifier(  
    max_depth=24,   
    max_features='log2',   
    min_samples_leaf=2,  
    min_samples_split=15,  
    n_estimators=191)  

In [71]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from scipy.stats import randint

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Define paramater ranges
param_dist = {
    'estimator__n_estimators': randint(170, 200),
    'estimator__max_depth': randint(15, 25),
    'estimator__min_samples_split': randint(15, 20),
    'estimator__min_samples_leaf': randint(1, 5),
    'estimator__max_features': ['auto', 'sqrt', 'log2']
}

# Fit model
random_search = RandomizedSearchCV(estimator=multi_output, param_distributions=param_dist, n_iter=100, cv=5, random_state=9, n_jobs=-1, scoring='accuracy')

random_search.fit(X_train, y_train)

# Best estimator
best_rf_estimator = random_search.best_estimator_.estimators_[0]

185 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
91 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 273, in _wrap_func_call
    return func()
           ^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  Fi

In [72]:
print("Best Parameters:", best_rf_estimator)

Best Parameters: RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15,
                       n_estimators=191, random_state=9)


##### Model 2
  
With updated paramaters
  
Mean Cross-validation score: 0.5975163932514909

In [82]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [83]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.5530648  0.60350263 0.5943958  0.64903678 0.60175131 0.6150613
 0.64063047 0.59474606 0.58178634 0.64763573 0.62802102 0.61155867
 0.61015762 0.63782837 0.59544658 0.59964974 0.67028732 0.61142256
 0.59810792 0.30623686]
Mean Cross-validation score: 0.5975163932514909


In [133]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=7)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=7)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [134]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.54325744 0.60210158 0.59649737 0.64238179 0.60805604 0.61541156
 0.63922942 0.59474606 0.58388792 0.64728546 0.62416813 0.61050788
 0.61085814 0.63817863 0.59229422 0.60105079 0.66398038 0.61107218
 0.60196216 0.28065872]
Mean Cross-validation score: 0.5953792937555303


#### Model 3

Add derived columns to model data  
-delivery_duration  
-order_placement_weekday  
-agreed_delivery_weekday  
-actual_delivery_weekday  

In [37]:
model_data.shape

(57096, 29)

In [41]:
model_data['delivery_duration'] = (model_data['actual_delivery_date'] - model_data['order_placement_date']).dt.days.astype('int64')
model_data['order_placement_weekday'] = model_data['order_placement_date'].dt.weekday.astype('int64')
model_data['agreed_delivery_weekday'] = model_data['agreed_delivery_date'].dt.weekday.astype('int64')
model_data['actual_delivery_weekday'] = model_data['actual_delivery_date'].dt.weekday.astype('int64')

In [234]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [235]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.68231173 0.69176883 0.67425569 0.70402802 0.69281961 0.69527145
 0.69352014 0.67110333 0.69492119 0.70017513 0.69211909 0.69877408
 0.69457093 0.69001751 0.68196147 0.69562172 0.7091801  0.6899089
 0.69446391 0.68114926]
Mean Cross-validation score: 0.6913971051168545


In [None]:
Mean Cross-validation score: 0.6913971051168545


In [None]:
# Reverse delivery duration

In [219]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration_2',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [220]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.68336252 0.68861646 0.6647986  0.70507881 0.69597198 0.69842382
 0.69001751 0.67075306 0.69211909 0.69807356 0.69211909 0.69842382
 0.6882662  0.69772329 0.68056042 0.69527145 0.7067274  0.68570427
 0.69306237 0.68044849]
Mean Cross-validation score: 0.690276111077702


In [232]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'delivery_duration_2',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [233]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.64588441 0.68756567 0.66549912 0.70963222 0.68966725 0.69772329
 0.68931699 0.66900175 0.69001751 0.69457093 0.68861646 0.69316988
 0.68931699 0.68966725 0.67915937 0.68896673 0.70812894 0.68430273
 0.69131044 0.67098809]
Mean Cross-validation score: 0.6861253017548725


In [None]:
# Add additional columns to my model data

In [218]:
model_data.shape

(57096, 34)

In [245]:
rolling_windows = [1, 3, 5]

cols_to_roll = ['delivery_duration',
                'order_placement_weekday',
                'agreed_delivery_weekday',
                'actual_delivery_weekday']

new_cols = [f'{c}_rolling' for c in cols_to_roll]

# Define function to add rolling averages
def rolling_averages(group, cols_to_roll, rolling_windows):
    group = group.sort_values('order_placement_date')
    for window in rolling_windows:
        rolling_stats = group[cols_to_roll].rolling(window, closed='left').mean()
        rolling_cols = [f'{c}_rolling_{window}' for c in cols_to_roll]
        group[rolling_cols] = rolling_stats
        group = group.dropna(subset=rolling_cols)
    return group

# Apply function
rolling_customers = model_data.groupby('customer_name').apply(lambda x: rolling_averages(x, cols_to_roll, rolling_windows))

  rolling_customers = model_data.groupby('customer_name').apply(lambda x: rolling_averages(x, cols_to_roll, rolling_windows))


In [246]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday',
            'delivery_duration_rolling_1',
            'order_placement_weekday_rolling_1',
            'agreed_delivery_weekday_rolling_1',
            'actual_delivery_weekday_rolling_1',
            'delivery_duration_rolling_3',
            'order_placement_weekday_rolling_3',
            'agreed_delivery_weekday_rolling_3',
            'actual_delivery_weekday_rolling_3',
            'delivery_duration_rolling_5',
            'order_placement_weekday_rolling_5',
            'agreed_delivery_weekday_rolling_5',
            'actual_delivery_weekday_rolling_5']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(rolling_customers[features], rolling_customers[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [247]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, rolling_customers[features], rolling_customers[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.54615655 0.60603933 0.70294944 0.70259831 0.45189607 0.54213483
 0.69206461 0.7113764  0.62289326 0.55477528 0.68047753 0.51544944
 0.56882022 0.71207865 0.69592697 0.71699438 0.49964888 0.57654494
 0.59129213 0.67345506]
Mean Cross-validation score: 0.6181786138246813


In [248]:
rolling_windows = [1, 3, 5]

cols_to_roll = ['delivery_duration']

new_cols = [f'{c}_rolling' for c in cols_to_roll]

# Define function to add rolling averages
def rolling_averages(group, cols_to_roll, rolling_windows):
    group = group.sort_values('order_placement_date')
    for window in rolling_windows:
        rolling_stats = group[cols_to_roll].rolling(window, closed='left').mean()
        rolling_cols = [f'{c}_rolling_{window}' for c in cols_to_roll]
        group[rolling_cols] = rolling_stats
        group = group.dropna(subset=rolling_cols)
    return group

# Apply function
rolling_customers = model_data.groupby('customer_name').apply(lambda x: rolling_averages(x, cols_to_roll, rolling_windows))

  rolling_customers = model_data.groupby('customer_name').apply(lambda x: rolling_averages(x, cols_to_roll, rolling_windows))


In [250]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday',
            'delivery_duration_rolling_1',
            'delivery_duration_rolling_3',
            'delivery_duration_rolling_5',]
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(rolling_customers[features], rolling_customers[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [251]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, rolling_customers[features], rolling_customers[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.52439452 0.57654494 0.70049157 0.68995787 0.45575843 0.53019663
 0.69311798 0.70154494 0.61200843 0.54810393 0.6815309  0.51369382
 0.56636236 0.70856742 0.68785112 0.70962079 0.49648876 0.58286517
 0.58813202 0.6695927 ]
Mean Cross-validation score: 0.6118412149837712


In [253]:
rolling_windows = [1, 3, 5]

cols_to_roll = ['delivery_duration']

new_cols = [f'{c}_rolling' for c in cols_to_roll]

# Define function to add rolling averages
def rolling_averages(group, cols_to_roll, rolling_windows):
    group = group.sort_values('order_placement_date')
    for window in rolling_windows:
        rolling_stats = group[cols_to_roll].rolling(window, closed='left').mean()
        rolling_cols = [f'{c}_rolling_{window}' for c in cols_to_roll]
        group[rolling_cols] = rolling_stats
        group = group.dropna(subset=rolling_cols)
    return group

# Apply function
rolling_customers = model_data.groupby('city').apply(lambda x: rolling_averages(x, cols_to_roll, rolling_windows))

  rolling_customers = model_data.groupby('city').apply(lambda x: rolling_averages(x, cols_to_roll, rolling_windows))


In [254]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday',
            'delivery_duration_rolling_1',
            'delivery_duration_rolling_3',
            'delivery_duration_rolling_5',]
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(rolling_customers[features], rolling_customers[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [255]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, rolling_customers[features], rolling_customers[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.61913104 0.68255081 0.67519271 0.6899089  0.67659425 0.69796776
 0.64505957 0.63489839 0.64435879 0.65685244 0.68454259 0.67718191
 0.63617245 0.65474939 0.70031546 0.67928496 0.69120224 0.67542937
 0.68384157 0.67122327]
Mean Cross-validation score: 0.6688228941074581


In [256]:
rolling_windows = [1, 3, 5]

cols_to_roll = ['delivery_duration']

new_cols = [f'{c}_rolling' for c in cols_to_roll]

# Define function to add rolling averages
def rolling_averages(group, cols_to_roll, rolling_windows):
    group = group.sort_values('order_placement_date')
    for window in rolling_windows:
        rolling_stats = group[cols_to_roll].rolling(window, closed='left').mean()
        rolling_cols = [f'{c}_rolling_{window}' for c in cols_to_roll]
        group[rolling_cols] = rolling_stats
        group = group.dropna(subset=rolling_cols)
    return group

# Apply function
rolling_customers = model_data.groupby('product_name').apply(lambda x: rolling_averages(x, cols_to_roll, rolling_windows))

  rolling_customers = model_data.groupby('product_name').apply(lambda x: rolling_averages(x, cols_to_roll, rolling_windows))


In [257]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, rolling_customers[features], rolling_customers[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.67053038 0.6968739  0.69406393 0.70530383 0.68282403 0.68212153
 0.67579909 0.69546891 0.68984896 0.69301019 0.67685283 0.70249385
 0.68387777 0.68071654 0.69571328 0.68306395 0.6978215  0.68341532
 0.67357695 0.67357695]
Mean Cross-validation score: 0.6868476847199688


In [258]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, rolling_customers[features], rolling_customers[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.67053038 0.6968739  0.69406393 0.70530383 0.68282403 0.68212153
 0.67579909 0.69546891 0.68984896 0.69301019 0.67685283 0.70249385
 0.68387777 0.68071654 0.69571328 0.68306395 0.6978215  0.68341532
 0.67357695 0.67357695]
Mean Cross-validation score: 0.6868476847199688


In [None]:
# ADd more models to ensbmle

In [174]:
from sklearn.ensemble import GradientBoostingClassifier

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define gradient boosting classifier model, wrap in multi output classifier
gradient_boosting = GradientBoostingClassifier(max_depth=3, n_estimators=100, random_state=9)
multi_output_gbm = MultiOutputClassifier(gradient_boosting, n_jobs=-1)

# Fit model
multi_output_gbm.fit(X_train, y_train)

# Predict
y_pred = multi_output_gbm.predict(X_test)


In [175]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
scorer = make_scorer(accuracy_score)

# Perform cross-validation
cv_scores = cross_val_score(multi_output_gbm, model_data[features], model_data[targets], cv=20, scoring='accuracy')

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.57443082 0.67390543 0.65464098 0.68721541 0.68791594 0.67355517
 0.67285464 0.6647986  0.69281961 0.68721541 0.68441331 0.69772329
 0.68721541 0.67460595 0.66024518 0.67635727 0.69796776 0.67379117
 0.67344078 0.68325158]
Mean Cross-validation score: 0.6739181865375905


In [176]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define XGBoost classifier model, wrap in multi output classifier
xgb_classifier = xgb.XGBClassifier(max_depth=3, n_estimators=100, random_state=9)
multi_output_xgb = MultiOutputClassifier(xgb_classifier, n_jobs=-1)

# Fit model
multi_output_xgb.fit(X_train, y_train)

# Predict
y_pred = multi_output_xgb.predict(X_test)

In [177]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(xgb_classifier, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.69947461 0.71768827 0.71383538 0.73870403 0.72399299 0.72644483
 0.73695271 0.71033275 0.73099825 0.74255692 0.72784588 0.7408056
 0.73870403 0.72889667 0.73274956 0.71628722 0.74912404 0.7428171
 0.70532586 0.72529783]
Mean Cross-validation score: 0.727441726179007


##### 0.691

In [None]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_depth=24, max_features='log2', min_samples_split=15, min_samples_leaf=2, n_estimators=191, random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())


In [47]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier( n_estimators=214, 
                                        max_depth=29, 
                                        max_features='log2',
                                        min_samples_split=19,
                                        min_samples_leaf=2, 
                                        bootstrap=False,
                                        criterion='entropy',
                                        random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Fit model
multi_output.fit(X_train, y_train)

# Predict
y_pred = multi_output.predict(X_test)

In [48]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(multi_output, model_data[features], model_data[targets], cv=20, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.67180385 0.6938704  0.66900175 0.71278459 0.69912434 0.70683012
 0.70157618 0.67390543 0.70087566 0.70017513 0.70087566 0.70577933
 0.69737303 0.69912434 0.68721541 0.69527145 0.71653819 0.69341275
 0.69411352 0.68395235]
Mean Cross-validation score: 0.6951801754258933


##### Random Search

In [45]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define random forest classifier model, wrap in multi output classifier
random_forest = RandomForestClassifier(max_features='log2', random_state=9)
multi_output = MultiOutputClassifier(random_forest, n_jobs=-1)

# Define paramater ranges
param_dist = {
    'estimator__n_estimators': randint(190, 235),
    'estimator__max_depth': randint(24, 30),
    'estimator__min_samples_split': randint(10, 20),
    'estimator__min_samples_leaf': randint(1, 5),
    'estimator__bootstrap': [True, False], 
    'estimator__class_weight': [None, 'balanced'], 
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__max_leaf_nodes': [None, randint(10, 20)]
}

# Fit model
random_search = RandomizedSearchCV(estimator=multi_output, param_distributions=param_dist, n_iter=200, cv=5, random_state=9, n_jobs=-1, scoring='accuracy')

random_search.fit(X_train, y_train)

# Best estimator
best_rf_estimator = random_search.best_estimator_.estimators_[0]

print("Best Parameters:", best_rf_estimator)

490 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 273, in _wrap_func_call
    return func()
           ^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  Fi

Best Parameters: RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=29,
                       max_features='log2', min_samples_split=19,
                       n_estimators=214, random_state=9)


In [53]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define gradient boosting classifier model
gradient_boosting = GradientBoostingClassifier(random_state=9)
multi_output_gbm = MultiOutputClassifier(gradient_boosting, n_jobs=-1)

# Define parameter ranges
param_dist = {
    'estimator__n_estimators': randint(50, 500),  # Number of boosting stages
    'estimator__learning_rate': uniform(0.01, 0.5),  # Learning rate
    'estimator__max_depth': randint(3, 10),  # Maximum depth of each tree
    'estimator__min_samples_split': randint(2, 20),  # Minimum number of samples required to split a node
    'estimator__min_samples_leaf': randint(1, 10),  # Minimum number of samples required at each leaf node
    'estimator__max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at each split
    'estimator__subsample': uniform(0.5, 0.5),  # Fraction of samples to be used for fitting each base learner
    'estimator__loss': ['deviance', 'exponential']  # Loss function to be optimized
}

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=multi_output_gbm, 
                                   param_distributions=param_dist, 
                                   n_iter=100, 
                                   cv=5, 
                                   random_state=9, 
                                   n_jobs=-1, 
                                   scoring='accuracy')

# Perform random search
random_search.fit(X_train, y_train)

# Best estimator
best_gb_estimator = random_search.best_estimator_

print("Best Parameters:", best_gb_estimator)

360 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
145 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\_parallel_backends.py", line 273, in _wrap_func_call
    return func()
           ^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Olimpio Chris Campos\anaconda3\Lib\site-packages\joblib\parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  F

Best Parameters: MultiOutputClassifier(estimator=GradientBoostingClassifier(learning_rate=0.08379344283843525,
                                                           loss='exponential',
                                                           max_depth=4,
                                                           max_features='log2',
                                                           min_samples_split=10,
                                                           n_estimators=399,
                                                           random_state=9,
                                                           subsample=0.8625736391100656),
                      n_jobs=-1)


In [54]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define gradient boosting classifier model, wrap in multi output classifier
gradient_boosting = GradientBoostingClassifier( n_estimators=399,
                                                learning_rate=0.08379344283843525,
                                                max_depth=4,
                                                min_samples_split=10,
                                                max_features='log2',
                                                subsample=0.8625736391100656,
                                                loss='exponential',
                                                random_state=9)
multi_output_gbm = MultiOutputClassifier(gradient_boosting, n_jobs=-1)

# Fit model
multi_output_gbm.fit(X_train, y_train)

# Predict
y_pred = multi_output_gbm.predict(X_test)

In [55]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
scorer = make_scorer(accuracy_score)

# Perform cross-validation
cv_scores = cross_val_score(multi_output_gbm, model_data[features], model_data[targets], cv=20, scoring='accuracy')

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.7323993  0.74255692 0.72889667 0.72854641 0.71418564 0.74991243
 0.7323993  0.70683012 0.72994746 0.73415061 0.7323993  0.74045534
 0.73835377 0.7323993  0.7323993  0.74115587 0.74597057 0.74211633
 0.74141556 0.7210932 ]
Mean Cross-validation score: 0.7333791698013172


In [58]:
# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define XGBoost classifier model, wrap in multi output classifier
xgb_classifier = xgb.XGBClassifier(random_state=9)
multi_output_xgb = MultiOutputClassifier(xgb_classifier, n_jobs=-1)

# Define parameter ranges
param_dist = {
    'estimator__n_estimators': randint(50, 500),
    'estimator__learning_rate': uniform(0.01, 0.5),
    'estimator__max_depth': randint(3, 10),
    'estimator__min_samples_split': randint(2, 20),
    'estimator__min_samples_leaf': randint(1, 10),
    'estimator__max_features': ['auto', 'sqrt', 'log2'],
    'estimator__subsample': uniform(0.5, 0.5),
    'estimator__loss': ['deviance', 'exponential'],
    'estimator__gamma': uniform(0, 0.5), 
    'estimator__reg_alpha': uniform(0, 1),  
    'estimator__reg_lambda': uniform(0, 1),
    'estimator__scale_pos_weight': [1, 5, 10]
}

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=multi_output_xgb, 
                                   param_distributions=param_dist, 
                                   n_iter=100, 
                                   cv=5, 
                                   random_state=9, 
                                   n_jobs=-1, 
                                   scoring='accuracy')

# Perform random search
random_search.fit(X_train, y_train)

# Best estimator
best_gb_estimator = random_search.best_estimator_

print("Best Parameters:", best_gb_estimator)

Best Parameters: MultiOutputClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None,
                                              feature_types=None,
                                              gamma=0.06979642478434417,
                                              grow_policy=None,
                                              importance_type=None,
                                              interaction_constraints=None,
                    

In [68]:
best_params = best_gb_estimator.estimators_[0].get_params()
print("Best Parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


Best Parameters:
objective: binary:logistic
base_score: None
booster: None
callbacks: None
colsample_bylevel: None
colsample_bynode: None
colsample_bytree: None
device: None
early_stopping_rounds: None
enable_categorical: False
eval_metric: None
feature_types: None
gamma: 0.06979642478434417
grow_policy: None
importance_type: None
interaction_constraints: None
learning_rate: 0.23560185298243658
max_bin: None
max_cat_threshold: None
max_cat_to_onehot: None
max_delta_step: None
max_depth: 4
max_leaves: None
min_child_weight: None
missing: nan
monotone_constraints: None
multi_strategy: None
n_estimators: 111
n_jobs: None
num_parallel_tree: None
random_state: 9
reg_alpha: 0.9233154819848048
reg_lambda: 0.3860309321526918
sampling_method: None
scale_pos_weight: 1
subsample: 0.5951524634804044
tree_method: None
validate_parameters: None
verbosity: None
loss: deviance
max_features: log2
min_samples_leaf: 3
min_samples_split: 14


In [72]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

# Define features and targets
features = ['order_code',
            'order_placement_month',
            'order_placement_day',
            'order_placement_week_no',
            'customer_id',
            'customer_name_code',
            'city_code',
            'product_id',
            'product_name_code',
            'category_code',
            'order_qty',
            'agreed_delivery_month',
            'agreed_delivery_day',
            'agreed_delivery_week_no',
            'actual_delivery_month',
            'actual_delivery_day',
            'actual_delivery_week_no',
            'delivery_duration',
            'order_placement_weekday',
            'agreed_delivery_weekday',
            'actual_delivery_weekday']
targets = ['on_time', 'in_full']

# Define train test split
X_train, X_test, y_train, y_test = train_test_split(model_data[features], model_data[targets], test_size=0.2, random_state=9)

# Define XGBoost classifier model, wrap in multi output classifier
xgb_classifier = xgb.XGBClassifier( n_estimators=111,
                                    learning_rate=0.23560185298243658,
                                    max_depth=4,
                                    min_child_weight=None,
                                    subsample=0.5951524634804044,
                                    colsample_bytree=None,
                                    gamma=0.06979642478434417,
                                    reg_alpha=0.9233154819848048,
                                    reg_lambda=0.3860309321526918,
                                    scale_pos_weight=1,
                                    objective='binary:logistic')
multi_output_xgb = MultiOutputClassifier(xgb_classifier, n_jobs=-1)

# Fit model
multi_output_xgb.fit(X_train, y_train)

# Predict
y_pred = multi_output_xgb.predict(X_test)

In [74]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(xgb_classifier, model_data[features], model_data[targets], cv=10, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.73765324 0.73152364 0.741331   0.73677758 0.73327496 0.73782837
 0.7356805  0.73603083 0.74408828 0.73095113]
Mean Cross-validation score: 0.7365139536032301


In [75]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score

# Define a custom scorer for multi-output classification
# You can use any appropriate metric for your problem, such as accuracy
scorer = make_scorer(accuracy_score)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
cv_scores = cross_val_score(xgb_classifier, model_data[features], model_data[targets], cv=10, scoring=scorer)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean Cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.73765324 0.73152364 0.741331   0.73677758 0.73327496 0.73782837
 0.7356805  0.73603083 0.74408828 0.73095113]
Mean Cross-validation score: 0.7365139536032301


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

# Initialize the voting classifier
voting_classifier = VotingClassifier(estimators=[
    ('random_forest', multi_rf),
    ('gradient_boosting', multi_gb),
    ('xgboost', multi_xgb)
], voting='soft')  # You can choose 'hard' or 'soft' voting strategy

# Train the voting classifier
voting_classifier.fit(X_train, y_train)

# Evaluate the ensemble model
accuracy = voting_classifier.score(X_test, y_test)
print("Ensemble Model Accuracy:", accuracy)
