In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC


In [5]:
# Directory holding the project's data files, relative to this notebook
DATA_PATH = "../data"

# Load the training set from its CSV file
training_csv = f'{DATA_PATH}/training_data.csv'
training_data = pd.read_csv(training_csv)

# Preview the top rows as the cell output
training_data.head()

Unnamed: 0,DISTANCE,DELIVERY_HOUR,DELIVERY_WINDOW_DURATION,SHIPMENT_DURATION,DURATION_PER_DISTANCE,DELAYED,IS_VAN
0,184.080096,22,1.0,4.666667,0.025351,1,0
1,198.282622,13,0.5,7.0,0.035303,1,0
2,157.269664,8,1.0,4.75,0.030203,0,0
3,32.103778,14,0.5,5.0,0.155745,1,0
4,100.569459,8,0.5,2.0,0.019887,1,0


In [6]:
# Split the frame into the feature matrix X and the target vector y.
TARGET_COLUMN = "DELAYED"

# "Unnamed: 0" appears to be the row index that was written out when the CSV
# was saved (it counts 0, 1, 2, ... in the preview above). It carries no
# information about a shipment, so it must not be fed to the models as a feature.
NON_FEATURE_COLUMNS = [TARGET_COLUMN, "Unnamed: 0"]

# errors="ignore" keeps this cell working if the CSV is later re-exported
# without the index column.
X = training_data.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")
y = training_data[TARGET_COLUMN]

In [38]:
# Candidate classifiers as (estimator class, display name) pairs.
# The display name doubles as the lookup key into the hyperparameter search spaces.
models = [
    (GradientBoostingClassifier, "GradientBoostingClassifier"),
    (RandomForestClassifier, "RandomForestClassifier"),
    (SVC, "SVC"),
    (LogisticRegression, "LogisticRegression"),
]


In [41]:
# Hyperparameter search spaces, keyed by model display name.
# Each value is handed to RandomizedSearchCV as `param_distributions`; a value
# may be a single dict or a list of dicts (one dict is sampled first, then the
# parameters inside it).
param_distributions = {
    "RandomForestClassifier": {
        "n_estimators": [10, 50, 100, 200],
        # None means "use every feature", so the space stays valid no matter
        # how many columns the feature matrix has (a hard-coded 7 only works
        # when there are at least 7 columns).
        "max_features": [3, 5, None],
        "max_depth": [5, 10, 20, 50],
        "criterion": ["gini", "entropy"]
    },
    # Not every penalty works with every solver, so the space is split into
    # sub-grids that contain only valid combinations. A single flat dict would
    # let the search draw e.g. penalty="l1" with solver="lbfgs", which fails
    # at fit time and wastes search iterations.
    "LogisticRegression": [
        {
            # l2 is supported by all three solvers
            "penalty": ["l2"],
            "C": [0.1, 1, 10, 100],
            "solver": ["newton-cg", "lbfgs", "liblinear"]
        },
        {
            # of these solvers, only liblinear supports l1
            "penalty": ["l1"],
            "C": [0.1, 1, 10, 100],
            "solver": ["liblinear"]
        },
        {
            # liblinear cannot fit an unpenalized model; C has no effect
            # without a penalty, so it is not varied here
            "penalty": [None],
            "solver": ["newton-cg", "lbfgs"]
        }
    ],
    "SVC": {
        "C": [0.1, 1, 10, 100],
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
        "degree": [2, 3, 4],
        "gamma": ["scale", "auto"]
    },
    "GradientBoostingClassifier": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7],
        "subsample": [0.8, 0.9, 1.0],
        "min_samples_split": [2, 5, 10]
    }
}

In [None]:
# Tune every candidate model with a randomized hyperparameter search and
# collect cross-validated metrics for the best configuration of each.
SEED = 99                # single seed shared by the CV splitter, the search and the models
SCORING = "f1_weighted"  # metric used both to pick hyperparameters and to report

results = []

cv = KFold(n_splits=5, random_state=SEED, shuffle=True)

for model_class, model_name in models:
    # Seed the estimator itself; otherwise the stochastic models give
    # different numbers on every run of the notebook.
    model = model_class(random_state=SEED)
    param_dist = param_distributions[model_name]
    # Select hyperparameters on the same metric that is reported below
    # (the default would be plain accuracy).
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=10,
        scoring=SCORING,
        cv=cv,
        n_jobs=-1,
        random_state=SEED,
    )
    random_search.fit(X, y)
    best_estimator = random_search.best_estimator_
    # NOTE: these folds were also used to choose the hyperparameters, so the
    # scores are still somewhat optimistic; a held-out set or nested CV would
    # be needed for an unbiased estimate.
    cv_scores = cross_val_score(best_estimator, X, y, cv=cv, n_jobs=-1, scoring=SCORING)

    # Build the classification report from out-of-fold predictions.
    # best_estimator was refit on all of X, so best_estimator.predict(X)
    # would only measure how well it reproduces its own training labels.
    y_pred = cross_val_predict(best_estimator, X, y, cv=cv, n_jobs=-1)
    report = classification_report(y, y_pred, output_dict=True)

    results.append({
        'Model': model_name,
        'Best Params': random_search.best_params_,
        'CV Score Mean': cv_scores.mean(),
        'CV Score Std': cv_scores.std(),
        'F1 Score': report['weighted avg']['f1-score'],
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall']
    })

# One row per model; shown as the cell output
results_df = pd.DataFrame(results)
results_df