# Ajuste de Modelos

In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.tools import add_constant
import pandas as pd

In [3]:
# warnings are ignored to avoid cluttering the output
import warnings
warnings.filterwarnings("ignore")

In [4]:
url = "https://vincentarelbundock.github.io/Rdatasets/csv/AER/RecreationDemand.csv"
df = pd.read_csv(url)
df = df.drop(columns=["rownames"])

In [5]:
df['ski'] = df['ski'].astype('category')
df['userfee'] = df['userfee'].astype('category')

### Ajuste do Modelo de Poisson

In [None]:
poisson_model_complete = smf.glm(
    formula="trips ~ quality + income + ski + userfee + costC + costS + costH",
    data=df,
    family=sm.families.Poisson(),
).fit()

print(poisson_model_complete.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  trips   No. Observations:                  659
Model:                            GLM   Df Residuals:                      651
Model Family:                 Poisson   Df Model:                            7
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1529.4
Date:                Sun, 25 May 2025   Deviance:                       2305.8
Time:                        21:15:24   Pearson chi2:                 4.10e+03
No. Iterations:                     8   Pseudo R-squ. (CS):             0.9789
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.2650      0.094      2.

In [28]:
poisson_model_simple = smf.glm(
    formula="trips ~ quality + ski + userfee + costS",
    data=df,
    family=sm.families.Poisson(),
).fit()

print(poisson_model_simple.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  trips   No. Observations:                  659
Model:                            GLM   Df Residuals:                      654
Model Family:                 Poisson   Df Model:                            4
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1755.8
Date:                Sun, 25 May 2025   Deviance:                       2758.5
Time:                        21:30:16   Pearson chi2:                 6.22e+03
No. Iterations:                     6   Pseudo R-squ. (CS):             0.9581
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.1026      0.073      1.

In [67]:
import numpy as np

lambda_complete = poisson_model_complete.predict(df)
lambda_simple = poisson_model_simple.predict(df)

# Probabilidade de prever zero
prob_zero_complete = np.exp(-lambda_complete)
prob_zero_simple = np.exp(-lambda_simple)

# Número esperado de zeros
expected_zeros_complete = prob_zero_complete.sum()
expected_zeros_simple = prob_zero_simple.sum()
# zeros observados
n_zeros = (df['trips'] == 0).sum()


print(f"Zeros observados: {n_zeros}")
print(f"Zeros esperados pelo modelo completo: {expected_zeros_complete:.0f}")
print(f"Zeros esperados pelo modelo simples: {expected_zeros_simple:.0f}")

Zeros observados: 417
Zeros esperados pelo modelo completo: 277
Zeros esperados pelo modelo simples: 255


### Teste de Sobredispersão

In [59]:
# Valores ajustados
mu_hat = poisson_model_simple.fittedvalues
y = df["trips"]

# Estatistica Z_i
Zi = ((y - mu_hat)**2) - y / mu_hat

# Regressão de Z_i em mu_hat (sem intercepto)
sobredisp = sm.OLS(Zi, mu_hat).fit()
print(sobredisp.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.030
Model:                            OLS   Adj. R-squared (uncentered):              0.029
Method:                 Least Squares   F-statistic:                              20.44
Date:                Sun, 25 May 2025   Prob (F-statistic):                    7.31e-06
Time:                        21:44:56   Log-Likelihood:                         -4694.7
No. Observations:                 659   AIC:                                      9391.
Df Residuals:                     658   BIC:                                      9396.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [60]:
p_value = sobredisp.pvalues[0]
coef = sobredisp.params[0]

#print(f"P-value do teste de sobredispersão: {p_value}")
#print(f"Coeficiente estimado: {coef:.1f}")

# Teste de sobredispersão
if p_value < 0.05:
    print(f"O modelo apresenta sobredispersão, com p-value ({p_value}) < 0.05 e coeficiente estimado ({coef:.1f}) > 0.")
else:
    print("Não há evidências de sobredispersão nos dados.")

O modelo apresenta sobredispersão, com p-value (7.309728588708232e-06) < 0.05 e coeficiente estimado (12.4) > 0.


## Modelos Alternativos

### Binomial Negativa

In [35]:
neg_bin_complete = smf.glm(
    formula="trips ~ quality + income + ski + userfee + costC + costS + costH",
    data=df,
    family=sm.families.NegativeBinomial(),
).fit()

print(neg_bin_complete.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  trips   No. Observations:                  659
Model:                            GLM   Df Residuals:                      651
Model Family:        NegativeBinomial   Df Model:                            7
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -830.01
Date:                Sun, 25 May 2025   Deviance:                       503.97
Time:                        21:33:15   Pearson chi2:                 1.13e+03
No. Iterations:                    25   Pseudo R-squ. (CS):             0.7744
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -1.0085      0.194     -5.

In [39]:
neg_bin_simple = smf.glm(
    formula = "trips ~ quality + ski + userfee + costS",
    data=df,
    family=sm.families.NegativeBinomial(),
).fit()

print(neg_bin_simple.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  trips   No. Observations:                  659
Model:                            GLM   Df Residuals:                      654
Model Family:        NegativeBinomial   Df Model:                            4
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -927.23
Date:                Sun, 25 May 2025   Deviance:                       698.42
Time:                        21:33:59   Pearson chi2:                 2.67e+03
No. Iterations:                    13   Pseudo R-squ. (CS):             0.6969
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -0.7578      0.137     -5.

In [41]:
neg_bin_model = smf.glm(
    formula = "trips ~ quality + ski + userfee + costC + costS + costH",
    data=df,
    family=sm.families.NegativeBinomial(),
).fit()

print(neg_bin_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  trips   No. Observations:                  659
Model:                            GLM   Df Residuals:                      652
Model Family:        NegativeBinomial   Df Model:                            6
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -830.32
Date:                Sun, 25 May 2025   Deviance:                       504.59
Time:                        21:34:50   Pearson chi2:                 1.15e+03
No. Iterations:                    24   Pseudo R-squ. (CS):             0.7742
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -1.1109      0.151     -7.

In [43]:
# compare all models

models_comparison = pd.DataFrame({
    'Model': ['Poisson Completo', 'Poisson Simples',
                'Neg Bin Completo', 'Neg Bin Simples', 'Neg Bin sem Income'],
    'AIC': [m.aic for m in [
        poisson_model_complete, poisson_model_simple,
        neg_bin_complete, neg_bin_simple, neg_bin_model
    ]],
    'BIC': [m.bic for m in [
        poisson_model_complete, poisson_model_simple,
        neg_bin_complete, neg_bin_simple, neg_bin_model
    ]]
})

print(models_comparison.sort_values('AIC'))

                Model          AIC          BIC
4  Neg Bin sem Income  1674.631200 -3727.366185
2    Neg Bin Completo  1676.012852 -3721.493810
3     Neg Bin Simples  1864.468744 -3546.510088
0    Poisson Completo  3074.862594 -1919.675463
1     Poisson Simples  3521.533609 -1486.476618
