In [0]:
import statsmodels.api as sm
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [0]:
# Load the data: read the `poctesting.points_gold` table through the Spark
# session and convert the result to a pandas DataFrame.
# NOTE(review): `spark` is not created in this notebook; presumably the runtime
# provides the session -- confirm.
data = (
    spark
    .table("poctesting.points_gold")
    .toPandas()
)

In [0]:
# Helper that computes the pseudo-R² score used to evaluate the models

def pseudo_r2(y, y_pred):
    """Return ``1 - SS_res / SS_tot`` for observations ``y`` and predictions ``y_pred``.

    ``SS_res`` is the sum of squared differences between ``y`` and ``y_pred``;
    ``SS_tot`` is the sum of squared deviations of ``y`` from its own mean.

    Both inputs are first converted to flat float NumPy arrays so that the
    comparison is strictly positional. Without that step, two pandas Series
    carrying different indexes are aligned by label (yielding NaN residuals
    that are silently skipped), and an ``(n, 1)`` prediction column broadcasts
    against an ``(n,)`` target into an ``(n, n)`` matrix.

    Parameters
    ----------
    y : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    numpy.float64
        The score; 1.0 means the predictions match ``y`` exactly. If ``y`` is
        constant, ``SS_tot`` is zero and the division yields ``nan`` or
        ``-inf`` (NumPy emits a RuntimeWarning).

    Example
    -------
    ``print("Pseudo R²:", pseudo_r2(y, y_pred))``
    """
    observed = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    ss_res = np.sum((observed - predicted) ** 2)
    ss_tot = np.sum((observed - observed.mean()) ** 2)
    return 1 - ss_res / ss_tot

In [0]:
# Display the loaded DataFrame for a quick visual check (bare last expression,
# so the notebook renders it as the cell output).
data

### 1. Dividir datos en entrenamiento y test

In [0]:
from sklearn.model_selection import train_test_split

# Independent variables: dummy-encode the selected columns (dropping the first
# level of each encoded column) and cast the result to float.
# NOTE(review): by default get_dummies only expands object/string/category
# columns; numeric columns would pass through as plain numbers -- confirm the
# dtypes of these four columns match the intended encoding.
feature_columns = ['district', 'cluster', 'event_day', 'event_hour']
X = pd.get_dummies(data[feature_columns], drop_first=True).astype(float)

# Dependent variable, taken as a NumPy array
y = data['quantity_products'].values

# 70% train / 30% test split with a fixed seed. Stratify on "target > 0" so
# both partitions keep the same proportion of zero and positive targets.
is_positive = y > 0
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=is_positive,
)


### 2. Modelo Poisson clásico (Statsmodels)

In [0]:
import statsmodels.api as sm

# Training
# Always add the intercept column (has_constant="add"), exactly as is done for
# the test matrix below. With the default has_constant="skip", add_constant
# omits the intercept when a feature happens to be constant in the training
# split, so the train and test design matrices would end up with different
# columns and predict() would receive a mismatched matrix.
X_train_sm = sm.add_constant(X_train, has_constant="add")
poisson_model = sm.GLM(y_train, X_train_sm, family=sm.families.Poisson())
poisson_results = poisson_model.fit()

print(poisson_results.summary())

# Predictions on the held-out set, using the same design-matrix construction
X_test_sm = sm.add_constant(X_test, has_constant="add")
y_pred_test_sm = poisson_results.predict(X_test_sm)

# Evaluation on the test set
print("MAE Test (Statsmodels):", mean_absolute_error(y_test, y_pred_test_sm))
print("RMSE Test (Statsmodels):", np.sqrt(mean_squared_error(y_test, y_pred_test_sm)))
print("Pseudo R² Test (Statsmodels):", pseudo_r2(y_test, y_pred_test_sm))


In [0]:
# Display the test feature matrix for a quick visual check of the encoded columns.
X_test

### 3. Modelo Poisson Regularizado (sklearn)

In [0]:
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Training: fit the regularized Poisson regressor on the training split
model_reg = PoissonRegressor(alpha=1.0, max_iter=1000)
model_reg.fit(X_train, y_train)

# Predictions for both splits (train predictions are only used for the train MAE)
y_pred_train = model_reg.predict(X_train)
y_pred_test = model_reg.predict(X_test)

# Evaluation: compute every metric first, then report them in a fixed order
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
pseudo_r2_test = pseudo_r2(y_test, y_pred_test)

print("MAE Train:", mae_train)
print("MAE Test:", mae_test)
print("RMSE Test:", rmse_test)
print("Pseudo R² Test (sklearn):", pseudo_r2_test)


In [0]:
import matplotlib.pyplot as plt

# Observed vs. predicted scatter for both models on the test set
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred_test, alpha=0.3, label="Regularizado (sklearn)")
ax.scatter(y_test, y_pred_test_sm, alpha=0.3, label="Statsmodels", marker="x")

# Dashed red diagonal spanning the observed range: points on it are perfect predictions
observed_min, observed_max = y_test.min(), y_test.max()
ax.plot([observed_min, observed_max], [observed_min, observed_max], 'r--')

ax.set_xlabel("Ventas observadas")
ax.set_ylabel("Ventas predichas")
ax.set_title("Comparación: Poisson clásico vs Regularizado")
ax.legend()
plt.show()


In [0]:
# Predicciones

