In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine, text
from getpass import getpass
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats
from scipy.stats import spearmanr
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Connection parameters to PostgreSQL
DB_USER = "postgres"         
DB_PASSWORD = getpass()
DB_HOST = "localhost"         
DB_PORT = "5432"             
DB_NAME = "ecommerce"         

# Create connection
engine = create_engine(f'postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

 ········


In [3]:
df = pd.read_sql('''
SELECT *
FROM order_sample_ready_to_ml_features_fixed
''', engine).astype(float)

df.delivery_time = df.delivery_time.clip(lower=0)
df.estimated_delivery_days = df.estimated_delivery_days.clip(lower=0)

df.head()

Unnamed: 0,category_share_other_category_sum,sel2cust_dist,category_count_moveis_decoracao_sum,avg_review_score_bin,product_width_cm_min,category_share_utilidades_domesticas_sum,category_share_informatica_acessorios_sum,product_height_cm_min,category_count_beleza_saude_sum,product_name_lenght_std,...,payment_installments,category_count_telefonia_sum,estimated_delivery_days,category_count_esporte_lazer_sum,category_count_cama_mesa_banho_sum,price_min,category_count_other_category_sum,freight_value_min,category_count_automotivo_sum,delivery_time
0,1.0,0.0,0.0,2.0,14.0,0.0,0.0,9.0,0.0,0.0,...,2.0,0.0,15.0,0.0,0.0,58.9,1.0,13.29,0.0,7.0
1,1.0,5.51012,0.0,1.0,40.0,0.0,0.0,30.0,0.0,0.0,...,3.0,0.0,18.0,0.0,0.0,239.9,1.0,19.93,0.0,16.0
2,0.0,2.865357,1.0,1.0,33.0,0.0,0.0,13.0,0.0,0.0,...,5.0,0.0,21.0,0.0,0.0,199.0,0.0,17.87,0.0,8.0
3,1.0,2.652272,0.0,1.0,15.0,0.0,0.0,10.0,0.0,0.0,...,2.0,0.0,11.0,0.0,0.0,12.99,1.0,12.79,0.0,6.0
4,0.0,6.316069,0.0,1.0,30.0,0.0,0.0,40.0,0.0,0.0,...,3.0,0.0,40.0,0.0,0.0,199.9,0.0,18.14,0.0,25.0


In [4]:
X = df.drop(columns=["delivery_time"])  
y = df["delivery_time"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [11]:
# Normalize data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Выходной слой для регрессии
])

model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01),  
              loss='mse',
              metrics=['mae'])

history = model.fit(X_train_scaled, y_train,  
                    validation_data=(X_test_scaled, y_test),  
                    epochs=50, batch_size=32, verbose=1)


# Train metrics
y_pred = np.clip(model.predict(X_train_scaled), a_min=0, a_max=None)

mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(np.log(1+y_train), np.log(1+y_pred))

print(f"Metrics Train (simple NN):")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score (log space): {r2:.4f}")

# Test metrics
y_pred = np.clip(model.predict(X_test_scaled), a_min=0, a_max=None)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(np.log(1+y_test), np.log(1+y_pred))

print(f"\nMetrics Test (simple NN):")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score (log space): {r2:.4f}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 606us/step - loss: 82.1248 - mae: 5.6460 - val_loss: 66.5587 - val_mae: 5.2505
Epoch 2/50
[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 583us/step - loss: 73.8628 - mae: 5.2938 - val_loss: 67.3271 - val_mae: 5.1611
Epoch 3/50
[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 608us/step - loss: 72.9842 - mae: 5.2760 - val_loss: 66.4549 - val_mae: 5.3128
Epoch 4/50
[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 616us/step - loss: 70.2837 - mae: 5.2520 - val_loss: 66.3240 - val_mae: 5.3563
Epoch 5/50
[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 588us/step - loss: 71.8807 - mae: 5.2971 - val_loss: 65.7707 - val_mae: 5.1473
Epoch 6/50
[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 593us/step - loss: 73.2760 - mae: 5.3168 - val_loss: 65.9184 - val_mae: 5.1745
Epoch 7/50
[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

### Not so good, xgb is better so far. Let's upgrade the model

In [12]:
# Improved NN
model = keras.Sequential([
    layers.Dense(256, input_shape=(X_train.shape[1],)),
    layers.LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(128),
    layers.LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(64),
    layers.LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),
    layers.Dropout(0.2),

    layers.Dense(32),
    layers.LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),

    layers.Dense(1)  # Outer layer
])


model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),  
              loss=keras.losses.Huber(),
              metrics=['mae'])

# Callbacks 
early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, min_lr=1e-6)

history = model.fit(X_train_scaled, y_train,  
                    validation_data=(X_test_scaled, y_test),  
                    epochs=100, batch_size=32, verbose=1,
                    callbacks=[early_stop, reduce_lr])

# Train metrics
y_pred = np.clip(model.predict(X_train_scaled), a_min=0, a_max=None)

mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(np.log(1+y_train), np.log(1+y_pred))

print(f"Metrics Train (complex NN):")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score (log space): {r2:.4f}")

# Test metrics
y_pred = np.clip(model.predict(X_test_scaled), a_min=0, a_max=None)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(np.log(1+y_test), np.log(1+y_pred))

print(f"\nMetrics Test (complex NN):")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score (log space): {r2:.4f}")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 6.5060 - mae: 6.9853 - val_loss: 4.4216 - val_mae: 4.8932 - learning_rate: 0.0010
Epoch 2/100
[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 4.5195 - mae: 4.9900 - val_loss: 4.4151 - val_mae: 4.8855 - learning_rate: 0.0010
Epoch 3/100
[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 4.5274 - mae: 4.9982 - val_loss: 4.4262 - val_mae: 4.8977 - learning_rate: 0.0010
Epoch 4/100
[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 4.5459 - mae: 5.0173 - val_loss: 4.3963 - val_mae: 4.8673 - learning_rate: 0.0010
Epoch 5/100
[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 4.5178 - mae: 4.9876 - val_loss: 4.4082 - val_mae: 4.8788 - learning_rate: 0.0010
Epoch 6/100
[1m2410/2410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 4.5004 - mae: 4.9714

# Even better than XGboost and more stability. Out best model