In [None]:
import pandas as pd
# Read the CSV into `df`.
# NOTE(review): absolute path under /content — presumably a Colab runtime path;
# confirm before running elsewhere.
ruta_csv = '/content/nuevoDFconminutos.csv'
df = pd.read_csv(ruta_csv)

# Print the column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110159 entries, 0 to 110158
Data columns (total 68 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   order_id                     110159 non-null  object 
 1   cliente_id                   110159 non-null  object 
 2   orden_compra_timestamp       110159 non-null  object 
 3   orden_entrega_transportista  110159 non-null  object 
 4   fecha_de_entrega_estimada    110159 non-null  object 
 5   fecha_entrega_al_cliente     110159 non-null  object 
 6   fecha_de_entrega_limite      110159 non-null  object 
 7   ciudad_cliente               110159 non-null  object 
 8   estado_del_cliente           110159 non-null  object 
 9   region                       110159 non-null  object 
 10  lat_cliente                  110159 non-null  float64
 11  lon_cliente                  110159 non-null  float64
 12  dc_asignado                  110159 non-null  object 
 13 

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()

# --- De-duplicate on 'order_id', keeping the row with the largest 'total_peso_g' ---
# Sorting 'total_peso_g' in descending order within each 'order_id' places the
# heaviest row first, so keep='first' retains exactly that row.
df = (
    df.sort_values(by=['order_id', 'total_peso_g'], ascending=[True, False])
      .drop_duplicates(subset=['order_id'], keep='first')
)
# --- End of de-duplication ---

# Columns whose name contains any of these substrings are excluded.
# NOTE(review): this is a substring test, so "id" would also match a name such
# as "humidity" — confirm that every matched column is meant to be dropped.
palabras_clave = ("id", "fecha", "timestamp", "orden")
cols_excluir = [
    nombre for nombre in df.columns
    if any(clave in nombre for clave in palabras_clave)
]

# Extra columns to drop explicitly when the keyword filter did not already
# pick them up.
nuevas_cols_a_eliminar = [
    "dias_a_transportista",
    "dias_transporte_a_cliente",
    "colchon_dias",
    "diferencia_entrega_estimada",
    "despacho_tarde",
    "llego_tarde",
    "entrega_a_tiempo",
    "desviacion_vs_promesa",
    'desviacion_entrega'
]

# Order-preserving union: dict keys keep first-seen order and ignore repeats.
cols_excluir = list(dict.fromkeys(cols_excluir + nuevas_cols_a_eliminar))

df = df.drop(columns=cols_excluir)

# Drop every remaining column whose name ends in '_y'
# (presumably right-hand merge suffixes — TODO confirm).
cols_to_drop = [nombre for nombre in df.columns if nombre.endswith('_y')]
df = df.drop(columns=cols_to_drop)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95973 entries, 94313 to 66078
Data columns (total 42 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ciudad_cliente          95973 non-null  object 
 1   estado_del_cliente      95973 non-null  object 
 2   region                  95973 non-null  object 
 3   lat_cliente             95973 non-null  float64
 4   lon_cliente             95973 non-null  float64
 5   dc_asignado             95973 non-null  object 
 6   distancia_km            95973 non-null  float64
 7   dias_entrega            95973 non-null  int64  
 8   es_feriado              95973 non-null  int64  
 9   es_fin_de_semana        95973 non-null  int64  
 10  Categoría               95973 non-null  object 
 11  categoría_peso          95973 non-null  object 
 12  total_peso_g            95973 non-null  float64
 13  #_deproductos           95973 non-null  int64  
 14  precio                  95973 non-null 

In [None]:
# === BUILD THE CLASSIFICATION CATEGORY ===
def clasificar_dias(dias):
    """Map a delivery duration in days to its bucket label.

    Upper bounds are inclusive and checked in ascending order:
    ``dias <= 3`` -> "1-3 días", ``dias <= 10`` -> "4-10 días",
    ``dias <= 20`` -> "11-20 días"; any value that satisfies none of these
    comparisons falls through to "20+".

    Args:
        dias: Number of days; anything comparable with ``<=`` against ints.

    Returns:
        One of the four bucket labels as a string.
    """
    limites = (
        (3, "1-3 días"),
        (10, "4-10 días"),
        (20, "11-20 días"),
    )
    for tope, etiqueta in limites:
        if dias <= tope:
            return etiqueta
    return "20+"

# Label every row with the bucket returned by clasificar_dias for its
# 'dias_entrega' value.
dias_entrega_col = df["dias_entrega"]
df["clase_entrega"] = dias_entrega_col.apply(clasificar_dias)

In [None]:
# Remove 'dias_entrega' when it is present; errors='ignore' makes the drop a
# no-op when the column is already gone, so the cell can be re-run safely.
df = df.drop(columns=['dias_entrega'], errors='ignore')

In [None]:
# === IMPORTS ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import RandomOverSampler # Import RandomOverSampler

# === FEATURES AND TARGET ===
# Shared settings for the split, the oversampler and the classifier.
RANDOM_STATE = 42
TEST_SIZE = 0.3

# Explicit whitelist of predictor columns; indexing df with it raises KeyError
# if any of them is missing.
feature_cols = [
    'rango_distancia',
    'Estado',
    'Ciudad',
    'Nombre',
    'icon_x',
    'conditions_x',
    'datetime_x',
    'name_x',
    'ciudad_para_clima',
    'mes_año',
    'nombre_dia',
    'periodo_dia',
    'tipo_de_pago',
    'categoría_peso',
    'Categoría',
    'dc_asignado',
    'region',
    'estado_del_cliente',
    'ciudad_cliente',
    'costo_relativo_envio',
    'duracion_estimada_dias',
    'duracion_estimada_min',
    'lon_origen',
    'lat_origen',
    'cloudcover_x',
    'temp_x',
    'hora_compra',
    'año',
    'mes',
    'dia_semana',
    'pago',
    'costo_de_flete',
]
X = df[feature_cols]

y = df["clase_entrega"]  # Target is the existing categorical 'clase_entrega'

# === SPLIT COLUMNS BY TYPE ===
num_cols = X.select_dtypes(include=np.number).columns.tolist()
# Treat every non-numeric feature as categorical. Selecting only dtype "object"
# would leave bool / category / string columns in neither list, and the
# ColumnTransformer below (default remainder='drop') would silently discard them.
cat_cols = X.select_dtypes(exclude=np.number).columns.tolist()

# === PREPROCESSING ===
# Numeric features are standardised; categorical features are one-hot encoded.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros.
# NOTE(review): sparse_output=False materialises a dense matrix; some of the
# categorical features look high-cardinality (city names, 'datetime_x') —
# confirm that the memory footprint is acceptable.
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
])

# === TRAIN / TEST SPLIT ===
# Split *before* oversampling so the test set keeps the original class balance;
# stratify=y keeps the class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# === APPLY PREPROCESSING TO TRAINING DATA ===
# Fit the scaler/encoder on the training partition only.
X_train_processed = preprocessor.fit_transform(X_train)

# === APPLY OVERSAMPLING (RandomOverSampler) TO PROCESSED TRAINING DATA ===
ros = RandomOverSampler(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_processed, y_train)
print(f"Original training samples: {X_train.shape[0]}")
print(f"Resampled training samples: {X_train_resampled.shape[0]}")
# Show the class distribution before and after resampling to see the effect.
print("\nClass distribution before resampling (training data):")
print(y_train.value_counts())
print("\nClass distribution after resampling:")
print(y_train_resampled.value_counts())


# === TRAIN THE CLASSIFIER ON THE RESAMPLED DATA ===
classifier = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation="relu",
    max_iter=300,
    random_state=RANDOM_STATE,
)
classifier.fit(X_train_resampled, y_train_resampled)


# === APPLY PREPROCESSING TO TEST DATA ===
# Reuse the transformer fitted on the training data; the test set is not oversampled.
X_test_processed = preprocessor.transform(X_test)

# === EVALUATION ===
# Predict on the processed test data and report accuracy plus per-class metrics.
y_pred = classifier.predict(X_test_processed)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred))

Original training samples: 67181
Resampled training samples: 119472

Class distribution before resampling (training data):
clase_entrega
4-10 días     29868
11-20 días    23346
20+            9149
1-3 días       4818
Name: count, dtype: int64

Class distribution after resampling:
clase_entrega
11-20 días    29868
4-10 días     29868
20+           29868
1-3 días      29868
Name: count, dtype: int64




Accuracy: 0.49006668519033064

Reporte de clasificación:
               precision    recall  f1-score   support

    1-3 días       0.34      0.37      0.35      2065
  11-20 días       0.47      0.47      0.47     10005
         20+       0.31      0.33      0.32      3921
   4-10 días       0.59      0.58      0.58     12801

    accuracy                           0.49     28792
   macro avg       0.43      0.44      0.43     28792
weighted avg       0.49      0.49      0.49     28792

