In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the modelling table and attach log(total population),
# which the offset model further down consumes as `log_pop`
tw = pd.read_csv('../data/data_gwr.csv').assign(
    log_pop=lambda frame: np.log(frame['total_pop'])
)

# Non-climatic covariates: column name -> label used on plot axes
variable_labels = {
    'built_up_area': 'Built-up area',
    'elevation': 'Elevation',
}
variables = list(variable_labels)
variable_names = list(variable_labels.values())

# Summary statistics for the covariates (displayed as the cell output)
tw[variables].describe()

In [None]:
# Histogram of the response variable (dengue case counts), drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(tw['case_count'], bins=40, ax=ax)
ax.set_title('Distribution of dengue cases')
ax.set_xlabel('Count')
ax.set_ylabel('Frequency')

In [None]:
# Plot the relationships between dengue cases and non-climatic variables.
# The grid is sized from `variables` (one panel per covariate) instead of a
# fixed 1x2 layout, so adding a covariate does not overflow the subplot grid.
n_panels = max(len(variables), 1)
# squeeze=False keeps `axes` two-dimensional even when there is a single panel
fig, axes = plt.subplots(1, n_panels, figsize=(12, 5), squeeze=False)

for ax, variable, label in zip(axes[0], variables, variable_names):
    ax.scatter(tw[variable], tw['case_count'], alpha=.5)
    ax.set_xlabel(label)
    ax.set_ylabel('Case count')

# The layout only needs computing once, after every panel has been drawn
fig.tight_layout()

In [None]:
# Overdispersion check: compare the variance of the case counts with their mean
mean_count, var_count = tw['case_count'].mean(), tw['case_count'].var()
dispersion = var_count / mean_count

# One print call, one line per statistic
print(
    f"Mean: {mean_count:.2f}",
    f"Variance: {var_count:.2f}",
    f"Dispersion ratio (variance/mean): {dispersion:.2f}",
    sep="\n",
)

In [None]:
# Both models share one formula built from the non-climatic covariates
nc_formula = "case_count ~ built_up_area + elevation"

# NOTE(review): NegativeBinomial() is constructed with no arguments in both
# models, so the family's parameters are whatever statsmodels defaults to --
# confirm that this is intended.

# Model 1: covariates only
nbi_model = smf.glm(nc_formula, tw, family=sm.families.NegativeBinomial())
results1 = nbi_model.fit()
print(
    "\nNegative Binomial Model Results (without offset):",
    results1.summary(),
    sep="\n",
)

# Model 2: same specification, with log-population supplied as the offset
nbi_model_pop = smf.glm(
    nc_formula,
    tw,
    family=sm.families.NegativeBinomial(),
    offset=tw['log_pop'],
)
results2 = nbi_model_pop.fit()
print(
    "\nNegative Binomial Model Results (with population offset):",
    results2.summary(),
    sep="\n",
)

In [None]:
# Examine residuals for model diagnostics: one four-panel figure per fitted model
results = [results1, results2]
model_names = ['Negative Binomial Model (without offset)', 'Negative Binomial Model (with population offset)']

for result, model_name in zip(results, model_names):
    fig, axes = plt.subplots(1, 4, figsize=(20, 5))

    # In-sample predictions from the fitted model.
    # `tw['predicted']` is overwritten on every pass, so after the loop it
    # holds the predictions of the last model in `results`.
    tw['predicted'] = result.predict()

    # Raw residuals (observed minus predicted), computed once and reused below
    residuals = tw['case_count'] - tw['predicted']

    # Residuals
    axes[0].scatter(tw['predicted'], residuals, alpha=.5)
    axes[0].axhline(y=0, color='r', linestyle='-')
    axes[0].set_title('Residuals vs Predicted')
    axes[0].set_xlabel('Predicted values')
    axes[0].set_ylabel('Residuals')

    # Actual vs predicted, with the identity line as a reference
    axes[1].scatter(tw['case_count'], tw['predicted'], alpha=.5)
    max_val = max(tw['case_count'].max(), tw['predicted'].max())
    axes[1].plot([0, max_val], [0, max_val], 'r--')
    axes[1].set_title('Predicted vs Actual values')
    axes[1].set_xlabel('Actual values')
    axes[1].set_ylabel('Predicted values')

    # Q-Q plot. The residuals are not standardised, so a 45-degree line is not
    # a meaningful reference; line='s' draws a line scaled by the sample's
    # standard deviation and shifted by its mean instead.
    sm.qqplot(residuals, line='s', ax=axes[2], alpha=.5)
    axes[2].set_title('Q-Q plot of residuals')

    # Histogram of residuals
    axes[3].hist(residuals, bins=20)
    axes[3].set_title('Histogram of residuals')
    axes[3].set_xlabel('Residuals')

    fig.suptitle(f"Diagnostics for {model_name}", fontsize=16)
    fig.tight_layout()