In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score,classification_report, confusion_matrix, ConfusionMatrixDisplay

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read the delivery records from the CSV stored under the relative data/ folder.
data = pd.read_csv(filepath_or_buffer="data/entregas_paqueteria.csv")
# Preview the first five rows as a quick sanity check of the parse.
data.head(n=5)

Unnamed: 0,Distancia_km,Indice_Trafico,Indice_Clima,Paradas_Previas,Experiencia_Repartidor_meses,Hora_Dia,Zona_Riesgo,Peso_kg,Entrega_Tarde
0,17.22,59.7,3.3,8,58.7,17,0.087,16.64,1
1,1.82,64.2,5.2,6,59.4,16,0.46,11.61,0
2,5.9,55.4,6.7,2,4.4,12,0.473,3.61,0
3,5.02,16.4,0.1,4,13.5,12,0.3,12.27,0
4,4.81,19.2,5.6,6,20.0,12,0.083,17.19,0


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(650, 9)

In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) to
# check value ranges before modelling.
data.describe()

Unnamed: 0,Distancia_km,Indice_Trafico,Indice_Clima,Paradas_Previas,Experiencia_Repartidor_meses,Hora_Dia,Zona_Riesgo,Peso_kg,Entrega_Tarde
count,650.0,650.0,650.0,650.0,650.0,650.0,650.0,650.0,650.0
mean,12.859769,55.005077,4.800923,4.392308,29.522154,13.663077,0.496434,10.291215,0.687692
std,7.072714,25.291095,2.849107,2.77146,17.4668,4.835222,0.281528,5.472492,0.463791
min,0.5,10.3,0.0,0.0,0.0,6.0,0.002,0.52,0.0
25%,6.9425,33.625,2.4,2.0,13.925,10.0,0.27225,5.81,0.0
50%,12.335,55.55,4.7,4.0,28.95,13.0,0.4845,10.77,1.0
75%,19.06,75.5,7.275,7.0,45.1,18.0,0.726,14.7625,1.0
max,24.85,100.0,10.0,9.0,59.9,22.0,0.999,19.97,1.0


In [5]:
# Feature matrix: every column except the Entrega_Tarde target. Selecting by
# name instead of a hard-coded positional slice keeps the features correct if
# columns are ever added to, or reordered in, the CSV.
X = data.drop(columns=["Entrega_Tarde"])
X.head()

Unnamed: 0,Distancia_km,Indice_Trafico,Indice_Clima,Paradas_Previas,Experiencia_Repartidor_meses,Hora_Dia,Zona_Riesgo,Peso_kg
0,17.22,59.7,3.3,8,58.7,17,0.087,16.64
1,1.82,64.2,5.2,6,59.4,16,0.46,11.61
2,5.9,55.4,6.7,2,4.4,12,0.473,3.61
3,5.02,16.4,0.1,4,13.5,12,0.3,12.27
4,4.81,19.2,5.6,6,20.0,12,0.083,17.19


In [6]:
# Target vector: the Entrega_Tarde column, picked by name rather than by a
# hard-coded position so it cannot silently point at a different column if the
# CSV layout changes.
Y = data["Entrega_Tarde"]
Y.head()

0    1
1    0
2    0
3    0
4    0
Name: Entrega_Tarde, dtype: int64

In [7]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions equal in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [8]:
# Cross-validation splitter used for hyper-parameter search: 5 stratified,
# shuffled folds with a fixed seed so the fold assignment is repeatable.
cv_inner = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# One pipeline per candidate classifier, keyed by a short model name.
# KNN and SVC get a StandardScaler step in front of the estimator; the tree and
# Naive Bayes pipelines contain the estimator only. Every final step is named
# "model" so that grid keys of the form "model__<param>" address it.
pipelines = {
    "KNN": Pipeline([
        ("scaler", StandardScaler()),
        ("model", KNeighborsClassifier()),
    ]),
    "SVC": Pipeline([
        ("scaler", StandardScaler()),
        ("model", SVC()),
    ]),
    # The tree learner permutes features randomly at each split, so it is seeded
    # (same seed as the split and CV folds) to make its results reproducible.
    "Tree": Pipeline([("model", DecisionTreeClassifier(random_state=42))]),
    "Bayes": Pipeline([("model", GaussianNB())]),
}

In [10]:
# Hyper-parameter search space for each model name. The "model__" prefix in
# every key targets the pipeline step called "model".
param_grids = dict(
    KNN=dict(
        model__n_neighbors=[5, 7, 8, 9],
        model__weights=["uniform", "distance"],
    ),
    SVC=dict(
        model__C=[0.1, 1, 10],
        model__kernel=["rbf", "linear"],
    ),
    Tree=dict(
        model__max_depth=[None, 3, 5, 10],
    ),
    Bayes=dict(
        model__var_smoothing=[1e-9, 1e-8, 1e-7],
    ),
)

In [11]:
# Tune every pipeline with an exhaustive grid search on the training split.
# best_models keeps the winning fitted pipeline per model name; resuls keeps the
# winning parameters and their mean cross-validated accuracy.
best_models, resuls = {}, {}
for name in pipelines:
    grid = GridSearchCV(
        estimator=pipelines[name],
        param_grid=param_grids[name],
        cv=cv_inner,
        scoring="accuracy",
        n_jobs=-1,  # use all available cores
    )
    grid.fit(X_train, Y_train)

    best_models[name] = grid.best_estimator_
    resuls[name] = {
        "Best params": grid.best_params_,
        "CV Accuracy": grid.best_score_,
    }

In [12]:
# Score each tuned pipeline on the held-out test split and store the accuracy
# next to that model's cross-validation entry.
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_true=Y_test, y_pred=y_pred)
    resuls[name].update({"Test accuracy": test_acc})

In [13]:
# Lift the column-width limit so the parameter dictionaries are shown in full.
pd.options.display.max_colwidth = None
# Build the summary table and flip it so each model becomes one row.
result_data = pd.DataFrame(resuls).transpose()
result_data

Unnamed: 0,Best params,CV Accuracy,Test accuracy
KNN,"{'model__n_neighbors': 8, 'model__weights': 'distance'}",0.844231,0.807692
SVC,"{'model__C': 1, 'model__kernel': 'rbf'}",0.876923,0.907692
Tree,{'model__max_depth': 5},0.907692,0.884615
Bayes,{'model__var_smoothing': 1e-09},0.840385,0.892308
