In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import PoissonRegressor as SklearnPoisson
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Plot defaults applied to every figure created below.
# NOTE(review): the 'seaborn-v0_8-*' style names are presumably only available
# in newer Matplotlib releases - confirm against the environment's version.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 11,
})

# Title banner for the notebook output
print("üö¥ Bike Rental Prediction with Poisson GLM")
print("=" * 50)

## 📊 Creating Realistic Bike Rental Data

---

Let's simulate realistic bike rental data. Real bike-sharing data has clear patterns:

- **Temperature matters**: More rentals on warm days
- **Time of day matters**: Rush hours are busier
- **Weekends are different**: Leisure vs commute patterns
- **Humidity hurts**: Fewer rentals when it's humid

We'll generate data that follows these real-world patterns, with rentals following a **Poisson distribution** – exactly what a Poisson GLM is designed for!

In [None]:
np.random.seed(42)

# One year of hourly observations
n_days = 365
hours_per_day = 24
n_samples = n_days * hours_per_day  # 8760 rows

# Time features: `hour` cycles 0..23 inside each day, `day_of_year` is
# constant across the 24 rows that belong to the same day.
hour = np.tile(np.arange(hours_per_day), n_days)
day_of_year = np.repeat(np.arange(n_days), hours_per_day)

# Weekend flag: positions 5 and 6 of every 7-day cycle (roughly 2/7 of days)
day_of_week = day_of_year % 7
is_weekend = (day_of_week >= 5).astype(int)

# Temperature = seasonal sine + daily sine + Gaussian noise.
# The seasonal sine peaks near day 171 (80 + 365/4); the daily sine peaks at hour 12.
seasonal_temp = 15 + 12 * np.sin(2 * np.pi * (day_of_year - 80) / n_days)
daily_temp = 5 * np.sin(2 * np.pi * (hour - 6) / hours_per_day)
temperature = seasonal_temp + daily_temp + np.random.normal(0, 3, n_samples)
temperature = np.clip(temperature, -5, 40)  # keep within realistic bounds

# Humidity: decreases with temperature, plus noise; stored as a 0-1 fraction
humidity = 70 - 0.5 * temperature + np.random.normal(0, 10, n_samples)
humidity = np.clip(humidity, 20, 100) / 100

# Wind speed: folded normal, independent of the other features, capped at 50
windspeed = np.abs(np.random.normal(12, 8, n_samples))
windspeed = np.clip(windspeed, 0, 50)

# Holiday flag (about 3% of days). A holiday is a property of the whole day,
# so draw one Bernoulli value per DAY and repeat it for that day's 24 hours.
# (Drawing per hourly row would let holiday status flip from hour to hour.)
holiday_days = np.random.binomial(1, 0.03, n_days)
is_holiday = np.repeat(holiday_days, hours_per_day)

print("üìà Feature Statistics:")
print(f"   Temperature: {temperature.min():.1f}¬∞C to {temperature.max():.1f}¬∞C (mean: {temperature.mean():.1f}¬∞C)")
print(f"   Humidity: {humidity.min()*100:.1f}% to {humidity.max()*100:.1f}%")
print(f"   Wind Speed: {windspeed.min():.1f} to {windspeed.max():.1f} km/h")
print(f"   Weekend samples: {is_weekend.sum()} ({100*is_weekend.mean():.1f}%)")
print(f"   Holiday samples: {is_holiday.sum()} ({100*is_holiday.mean():.1f}%)")

In [None]:
# Now the magic: generate counts from Poisson distribution!

# The TRUE model (what we'll try to learn):
# log(λ) = β₀ + β₁·temp + β₂·humidity + β₃·weekend + β₄·hour_pattern

# Hour pattern: bimodal for weekdays (rush hours), unimodal for weekends
def hour_effect(h, weekend):
    """Return the additive time-of-day term of the simulated log-rate.

    Parameters
    ----------
    h : scalar or array-like
        Hour of day (the notebook passes integers 0-23).
    weekend : scalar or array-like
        Truthy for weekend rows, falsy for weekday rows. Broadcast against ``h``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for scalar inputs, otherwise an array with the broadcast shape.

    Notes
    -----
    * Weekend rows: ``-0.02 * (h - 13)**2`` - a downward parabola whose
      maximum (0) is at hour 13, so the weekend term is never positive.
    * Weekday rows: ``1.5 * (morning + evening)`` where the two terms are
      Gaussian bumps centred at hour 8 (width 2) and hour 18 (width 2.5),
      so the weekday term is never negative.
    """
    h = np.asarray(h, dtype=float)
    on_weekend = np.asarray(weekend, dtype=bool)

    # Both curves are evaluated everywhere; np.where then picks per element.
    weekend_curve = -0.02 * (h - 13) ** 2
    morning = np.exp(-0.5 * ((h - 8) / 2) ** 2)
    evening = np.exp(-0.5 * ((h - 18) / 2.5) ** 2)
    weekday_curve = 1.5 * (morning + evening)

    effect = np.where(on_weekend, weekend_curve, weekday_curve)
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays untouched, so scalar callers keep working.
    return effect[()]

# Time-of-day contribution for every row, evaluated one (hour, weekend) pair at a time
hour_effects = np.fromiter(
    (hour_effect(h, w) for h, w in zip(hour, is_weekend)),
    dtype=float,
    count=len(hour),
)

# Linear predictor log(lambda). The terms are accumulated in the same
# left-to-right order as a single chained expression would be.
weather_terms = (
    2.5                           # baseline (intercept)
    + 0.04 * temperature          # warmer -> more rentals
    - 1.2 * humidity              # more humid -> fewer rentals
    - 0.02 * windspeed            # windier -> fewer rentals
)
log_lambda = (
    weather_terms
    + 0.3 * is_weekend            # weekend shift
    - 0.5 * is_holiday            # holiday shift
    + hour_effects                # time-of-day pattern
)

# exp() maps the linear predictor to a strictly positive Poisson rate
lambda_rate = np.exp(log_lambda)

# One Poisson draw per row, each with its own rate
rentals = np.random.poisson(lambda_rate)

# Assemble the modelling table (column order matters for the display below)
df = pd.DataFrame(dict(
    hour=hour,
    temperature=temperature,
    humidity=humidity,
    windspeed=windspeed,
    is_weekend=is_weekend,
    is_holiday=is_holiday,
    rentals=rentals,
))

print("\nüö¥ Bike Rental Dataset Created!")
print("="*50)
print(f"Total samples: {len(df):,}")
print(f"\nRental counts summary:")
print(f"   Min: {rentals.min()}")
print(f"   Max: {rentals.max()}")
print(f"   Mean: {rentals.mean():.2f}")
print(f"   Variance: {rentals.var():.2f}")
print(f"   Variance/Mean ratio: {rentals.var()/rentals.mean():.2f}")
print("\nüìã First 10 rows:")
df.head(10)

## 🔍 Exploratory Data Analysis

---

Before we build our model, let's understand the data. For count data, we need to check:

1. **Distribution of counts**: Is it roughly Poisson-shaped?
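2. **Mean-Variance relationship**: Poisson assumes Variance = Mean *conditional on the features* (pooled counts can look overdispersed simply because the mean differs from row to row)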
3. **Feature relationships**: Which features affect rentals most?

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Distribution of rental counts
ax1 = axes[0, 0]
ax1.hist(df['rentals'], bins=50, density=True, alpha=0.7, color='steelblue', edgecolor='white')

# Overlay a single-rate Poisson pmf with the same overall mean.
# The grid runs to max + 1 so the largest observed count is included
# (np.arange excludes its stop value).
mean_rentals = df['rentals'].mean()
x_poisson = np.arange(0, df['rentals'].max() + 1)
y_poisson = stats.poisson.pmf(x_poisson, mean_rentals)
ax1.plot(x_poisson, y_poisson, 'r-', linewidth=2, label=f'Poisson(Œª={mean_rentals:.1f})')

ax1.set_xlabel('Number of Rentals', fontsize=12)
ax1.set_ylabel('Density', fontsize=12)
ax1.set_title('Distribution of Bike Rentals', fontsize=14, fontweight='bold')
ax1.legend()

# 2. Mean-Variance relationship by hour
ax2 = axes[0, 1]
hourly_stats = df.groupby('hour')['rentals'].agg(['mean', 'var'])
ax2.scatter(hourly_stats['mean'], hourly_stats['var'], s=100, c='steelblue', edgecolor='white')
# Reference line y = x: under a Poisson model the variance equals the mean
max_val = max(hourly_stats['mean'].max(), hourly_stats['var'].max())
ax2.plot([0, max_val], [0, max_val], 'r--', linewidth=2, label='Var = Mean (Poisson)')
ax2.set_xlabel('Mean Rentals', fontsize=12)
ax2.set_ylabel('Variance of Rentals', fontsize=12)
ax2.set_title('Mean vs Variance (by Hour)', fontsize=14, fontweight='bold')
ax2.legend()

# 3. Rentals by hour of day, split by weekday / weekend
ax3 = axes[1, 0]
weekday_hourly = df[df['is_weekend'] == 0].groupby('hour')['rentals'].mean()
weekend_hourly = df[df['is_weekend'] == 1].groupby('hour')['rentals'].mean()
ax3.plot(weekday_hourly.index, weekday_hourly.values, 'o-', linewidth=2, markersize=6, label='Weekday')
ax3.plot(weekend_hourly.index, weekend_hourly.values, 's-', linewidth=2, markersize=6, label='Weekend')
ax3.set_xlabel('Hour of Day', fontsize=12)
ax3.set_ylabel('Average Rentals', fontsize=12)
ax3.set_title('Rental Patterns: Weekday vs Weekend', fontsize=14, fontweight='bold')
ax3.legend()
ax3.set_xticks(range(0, 24, 3))

# 4. Temperature effect: mean rentals inside 10 equal-width temperature bins
ax4 = axes[1, 1]
temp_bins = pd.cut(df['temperature'], bins=10)
# observed=False is stated explicitly so every bin keeps its bar position
# regardless of the installed pandas version's default for categoricals.
temp_means = df.groupby(temp_bins, observed=False)['rentals'].mean()
ax4.bar(range(len(temp_means)), temp_means.values, color='coral', edgecolor='white')
ax4.set_xticks(range(len(temp_means)))
ax4.set_xticklabels([f'{int(i.left)}-{int(i.right)}' for i in temp_means.index], rotation=45)
ax4.set_xlabel('Temperature (¬∞C)', fontsize=12)
ax4.set_ylabel('Average Rentals', fontsize=12)
ax4.set_title('Temperature Effect on Rentals', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Pooled variance/mean ratio of the raw counts. This is a marginal check:
# it does not condition on any feature.
var_mean_ratio = df['rentals'].var() / df['rentals'].mean()
print(f"\nüìä Overdispersion Check:")
print(f"   Variance/Mean ratio: {var_mean_ratio:.2f}")
if var_mean_ratio > 1.5:
    print("   ‚ö†Ô∏è Some overdispersion detected (variance > mean)")
    print("   Consider Negative Binomial for better fit, but Poisson is still reasonable.")
else:
    print("   ‚úÖ Data looks suitable for Poisson regression!")

## 🔧 Poisson Regression From Scratch

---

Alright, now for the fun part! Let's build Poisson regression from scratch.

### The Math Behind Poisson GLM

For Poisson regression:

**Link Function** (log link):
$$g(\mu) = \log(\mu) = \theta^T x$$

**Mean Function**:
$$\mu = e^{\theta^T x}$$

**Poisson Probability**:
$$P(y | \mu) = \frac{\mu^y e^{-\mu}}{y!}$$

**Negative Log-Likelihood** (what we minimize):
$$\text{NLL} = \sum_{i=1}^{m} \left[ \mu^{(i)} - y^{(i)} \log(\mu^{(i)}) \right] + \text{const}$$

Where $\mu^{(i)} = e^{\theta^T x^{(i)}}$

**Gradient** (for gradient descent):
$$\nabla_{\theta} \text{NLL} = \sum_{i=1}^{m} (\mu^{(i)} - y^{(i)}) x^{(i)} = X^T (\mu - y)$$

This is beautifully similar to logistic regression! The difference is just in how we compute $\mu$.

In [None]:
class PoissonRegression:
    """
    Poisson GLM fitted by batch gradient descent.

    Model: y ~ Poisson(mu) with a log link, i.e. log(mu) = theta^T x, where
    an intercept column of ones is prepended to the features internally.

    Parameters
    ----------
    learning_rate : float, default 0.001
        Step size of each gradient-descent update.
    n_epochs : int, default 1000
        Number of full-batch updates performed by ``fit``.
    verbose : bool, default True
        If True, ``fit`` prints progress every 200 epochs and a final summary.

    Attributes
    ----------
    theta : numpy.ndarray or None
        Fitted parameters, intercept first. ``None`` until ``fit`` is called.
    loss_history : list of float
        Mean negative log-likelihood recorded at the start of every epoch
        (i.e. before that epoch's update), so its length equals ``n_epochs``.
    """

    def __init__(self, learning_rate=0.001, n_epochs=1000, verbose=True):
        self.lr = learning_rate
        self.epochs = n_epochs
        self.verbose = verbose
        self.theta = None
        self.loss_history = []

    def _add_intercept(self, X):
        """Return X with a leading column of ones for the intercept term."""
        return np.column_stack([np.ones(X.shape[0]), X])

    def _check_is_fitted(self):
        """Raise RuntimeError if the model has no parameters yet."""
        if self.theta is None:
            raise RuntimeError(
                "PoissonRegression is not fitted yet; call fit(X, y) first."
            )

    def _compute_mu(self, X):
        """Compute the Poisson mean mu = exp(X @ theta) for an augmented X."""
        self._check_is_fitted()
        linear_pred = X @ self.theta
        # Clip the linear predictor so exp() cannot overflow
        linear_pred = np.clip(linear_pred, -20, 20)
        return np.exp(linear_pred)

    def _negative_log_likelihood(self, X, y, mu=None):
        """
        Mean Poisson negative log-likelihood, up to a theta-independent constant.

        Parameters
        ----------
        X : array, shape (m, n + 1)
            Augmented feature matrix (intercept column included).
        y : array, shape (m,)
            Observed counts.
        mu : array, shape (m,), optional
            Pre-computed means for X. Passing it avoids recomputing
            exp(X @ theta) when the caller already has it.

        Returns
        -------
        float
            sum(mu - y * log(mu)) / m. The log(y!) term is omitted because it
            does not depend on theta.
        """
        if mu is None:
            mu = self._compute_mu(X)
        nll = np.sum(mu - y * np.log(mu + 1e-10))
        return nll / len(y)

    def fit(self, X, y):
        """
        Fit Poisson regression using gradient descent.

        Parameters
        ----------
        X : array-like, shape (m, n)
            Feature matrix.
        y : array-like, shape (m,) or (m, 1)
            Count data (non-negative). A column vector is flattened.

        Returns
        -------
        self : PoissonRegression
            The fitted estimator.

        Raises
        ------
        ValueError
            If X is empty, X and y have different numbers of samples, or y
            contains negative values.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y so that a (m, 1) column cannot broadcast against mu
        y = np.asarray(y, dtype=float).ravel()

        # Add intercept
        X_aug = self._add_intercept(X)
        m, n = X_aug.shape

        if m == 0:
            raise ValueError("Cannot fit PoissonRegression on zero samples.")
        if y.shape[0] != m:
            raise ValueError(
                f"X and y have inconsistent numbers of samples: {m} != {y.shape[0]}"
            )
        if np.any(y < 0):
            raise ValueError("y contains negative values; Poisson counts must be >= 0.")

        # Start from all-zero parameters (mu = 1 for every sample)
        self.theta = np.zeros(n)
        self.loss_history = []

        if self.verbose:
            print("üîÑ Training Poisson Regression...")
            print(f"   Samples: {m}, Features: {n-1} (+intercept)")

        for epoch in range(self.epochs):
            # Predicted mean, computed once per epoch and shared by the
            # loss and the gradient
            mu = self._compute_mu(X_aug)

            loss = self._negative_log_likelihood(X_aug, y, mu=mu)
            self.loss_history.append(loss)

            # Gradient of the mean NLL: X^T (mu - y) / m
            gradient = X_aug.T @ (mu - y) / m

            self.theta -= self.lr * gradient

            if self.verbose and (epoch + 1) % 200 == 0:
                print(f"   Epoch {epoch+1}/{self.epochs}: Loss = {loss:.4f}")

        if self.verbose:
            # With n_epochs == 0 there is no recorded loss; evaluate it at
            # the initial parameters instead of indexing an empty list.
            if self.loss_history:
                final_loss = self.loss_history[-1]
            else:
                final_loss = self._negative_log_likelihood(X_aug, y)
            print(f"\n‚úÖ Training complete! Final loss: {final_loss:.4f}")

        return self

    def predict(self, X):
        """
        Predict expected counts mu = exp(theta^T x).

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        self._check_is_fitted()
        X = np.asarray(X, dtype=float)
        X_aug = self._add_intercept(X)
        return self._compute_mu(X_aug)

    def predict_counts(self, X):
        """Predict expected counts rounded to the nearest integer."""
        return np.round(self.predict(X)).astype(int)

    def get_coefficients(self):
        """
        Return the fitted parameters as a dict.

        Returns
        -------
        dict
            ``{'intercept': theta[0], 'coefficients': theta[1:]}``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        self._check_is_fitted()
        return {
            'intercept': self.theta[0],
            'coefficients': self.theta[1:]
        }

# Confirmation banner plus a short reference of the public methods
print("‚úÖ PoissonRegression class defined!")
print("\nKey methods:")
for method_summary in (
    "   .fit(X, y)     - Train the model",
    "   .predict(X)    - Predict expected counts (continuous)",
    "   .predict_counts(X) - Predict rounded counts",
):
    print(method_summary)

## 🚀 Training Our Model

---

Let's prepare the data and train our Poisson regression model from scratch!

In [None]:
# Model inputs: six feature columns and the rental count target
feature_cols = ['hour', 'temperature', 'humidity', 'windspeed', 'is_weekend', 'is_holiday']
X = df[feature_cols].to_numpy()
y = df['rentals'].to_numpy()

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise features (important for gradient descent!). The scaler learns
# its statistics from the training split only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"üìä Data Split:")
print(f"   Training: {len(X_train):,} samples")
print(f"   Testing:  {len(X_test):,} samples")
print(f"\nüìà Target Statistics (Training):")
print(f"   Mean: {y_train.mean():.2f}")
print(f"   Std:  {y_train.std():.2f}")
print(f"   Min:  {y_train.min()}, Max: {y_train.max()}")

In [None]:
# Fit the from-scratch model on the standardised training data.
# fit() returns the estimator itself, so construction and training can be chained.
model_scratch = PoissonRegression(
    learning_rate=0.01,
    n_epochs=1000,
    verbose=True,
).fit(X_train_scaled, y_train)

# Expected counts for the held-out rows
y_pred_scratch = model_scratch.predict(X_test_scaled)

# Error metrics on the held-out rows
mae_scratch = mean_absolute_error(y_test, y_pred_scratch)
mse_scratch = mean_squared_error(y_test, y_pred_scratch)
rmse_scratch = np.sqrt(mse_scratch)

print(f"\nüìä Model Performance (From Scratch):")
print(f"   MAE:  {mae_scratch:.2f} bikes")
print(f"   RMSE: {rmse_scratch:.2f} bikes")

In [None]:
# Two panels: loss trajectory (left) and actual-vs-predicted scatter (right)
fig, (loss_ax, scatter_ax) = plt.subplots(1, 2, figsize=(14, 5))
title_style = dict(fontsize=14, fontweight='bold')

# Left: mean negative log-likelihood recorded at each epoch
loss_curve = model_scratch.loss_history
loss_ax.plot(loss_curve, linewidth=2, color='steelblue')
loss_ax.set_xlabel('Epoch', fontsize=12)
loss_ax.set_ylabel('Negative Log-Likelihood', fontsize=12)
loss_ax.set_title('Training Loss Over Time', **title_style)
loss_ax.set_xlim(0, len(loss_curve))

# Right: 200 randomly chosen test rows, drawn without replacement
sample_idx = np.random.choice(len(y_test), 200, replace=False)
scatter_ax.scatter(y_test[sample_idx], y_pred_scratch[sample_idx], alpha=0.5, s=30, color='steelblue')
# Diagonal reference spans the larger of the observed / predicted maxima
max_val = max(y_test.max(), y_pred_scratch.max())
scatter_ax.plot([0, max_val], [0, max_val], 'r--', linewidth=2, label='Perfect Prediction')
scatter_ax.set_xlabel('Actual Rentals', fontsize=12)
scatter_ax.set_ylabel('Predicted Rentals', fontsize=12)
scatter_ax.set_title('Actual vs Predicted (From Scratch)', **title_style)
scatter_ax.legend()

plt.tight_layout()
plt.show()

print("\nüí° The model converges nicely and predictions follow the diagonal!")

## 🔬 Comparison with Sklearn

---

Let's compare our from-scratch implementation with sklearn's `PoissonRegressor`. This validates that we implemented it correctly!

In [None]:
# Reference fit with scikit-learn's PoissonRegressor (imported as SklearnPoisson
# in the notebook's import cell) on the same standardised training data.
model_sklearn = SklearnPoisson(alpha=0, max_iter=1000)  # alpha=0 means no regularization
model_sklearn.fit(X_train_scaled, y_train)

# Expected counts for the held-out rows
y_pred_sklearn = model_sklearn.predict(X_test_scaled)

# Same error metrics as for the from-scratch model
mae_sklearn = mean_absolute_error(y_test, y_pred_sklearn)
rmse_sklearn = np.sqrt(mean_squared_error(y_test, y_pred_sklearn))

print("üìä Performance Comparison:")
print("="*50)
print(f"{'Metric':<15} {'From Scratch':>15} {'Sklearn':>15}")
print("-"*50)
print(f"{'MAE':<15} {mae_scratch:>15.2f} {mae_sklearn:>15.2f}")
print(f"{'RMSE':<15} {rmse_scratch:>15.2f} {rmse_sklearn:>15.2f}")
print("="*50)

# Side-by-side coefficients, tracking the largest absolute gap so the
# closing verdict reflects what was actually measured.
print("\nüîç Coefficient Comparison:")
print("-"*60)
scratch_coefs = model_scratch.get_coefficients()
print(f"{'Feature':<15} {'Scratch':>15} {'Sklearn':>15} {'Diff':>12}")
print("-"*60)
intercept_gap = abs(scratch_coefs['intercept'] - model_sklearn.intercept_)
print(f"{'Intercept':<15} {scratch_coefs['intercept']:>15.4f} {model_sklearn.intercept_:>15.4f} {intercept_gap:>12.4f}")

max_coef_gap = intercept_gap
for i, feat in enumerate(feature_cols):
    s_coef = scratch_coefs['coefficients'][i]
    sk_coef = model_sklearn.coef_[i]
    coef_gap = abs(s_coef - sk_coef)
    max_coef_gap = max(max_coef_gap, coef_gap)
    print(f"{feat:<15} {s_coef:>15.4f} {sk_coef:>15.4f} {coef_gap:>12.4f}")

# Largest absolute parameter difference still reported as a "close" match.
# NOTE(review): 0.05 is a judgement call on the log scale - tune if needed.
COEF_MATCH_TOLERANCE = 0.05
if max_coef_gap <= COEF_MATCH_TOLERANCE:
    print("\n‚úÖ Our implementation matches sklearn closely!")
else:
    print(
        f"\nWARNING: largest coefficient gap is {max_coef_gap:.4f} "
        f"(tolerance {COEF_MATCH_TOLERANCE}). The from-scratch fit has not reached "
        "sklearn's solution; try more epochs or a different learning rate."
    )

## 📈 Model Diagnostics

---

For any GLM, we should check:

1. **Residual analysis**: Are residuals well-behaved?
2. **Overdispersion**: Is variance > mean? (Poisson assumption violation)
3. **Deviance residuals**: More appropriate for Poisson than raw residuals

In [None]:
# All diagnostics below use the scikit-learn predictions on the held-out rows
y_pred = y_pred_sklearn

# Raw residuals: observed minus fitted
raw_residuals = y_test - y_pred

# Pearson residuals: (y - mu) / sqrt(mu). Under a Poisson model the variance
# equals mu, so this rescales each residual by its expected standard deviation.
pearson_residuals = (y_test - y_pred) / np.sqrt(y_pred + 1e-10)

# Deviance residuals: sign(y - mu) * sqrt(2 * (y*log(y/mu) - (y - mu))).
# The y*log(y/mu) part is set to 0 where y == 0; NaNs are replaced by 0 at the end.
with np.errstate(divide='ignore', invalid='ignore'):
    log_part = np.where(y_test > 0, y_test * np.log(y_test / (y_pred + 1e-10)), 0)
    linear_part = y_test - y_pred
    deviance_residuals = np.nan_to_num(
        np.sign(y_test - y_pred) * np.sqrt(2 * np.abs(log_part - linear_part))
    )

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(raw_ax, pearson_ax), (hist_ax, qq_ax) = axes
title_style = dict(fontsize=14, fontweight='bold')

# Top-left: raw residuals against fitted values
raw_ax.scatter(y_pred, raw_residuals, alpha=0.3, s=10, color='steelblue')
raw_ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
raw_ax.set_xlabel('Fitted Values', fontsize=12)
raw_ax.set_ylabel('Raw Residuals', fontsize=12)
raw_ax.set_title('Raw Residuals vs Fitted', **title_style)

# Top-right: Pearson residuals against fitted values, with +/-2 guide lines
pearson_ax.scatter(y_pred, pearson_residuals, alpha=0.3, s=10, color='coral')
pearson_ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
for guide_level in (2, -2):
    pearson_ax.axhline(y=guide_level, color='gray', linestyle=':', alpha=0.5)
pearson_ax.set_xlabel('Fitted Values', fontsize=12)
pearson_ax.set_ylabel('Pearson Residuals', fontsize=12)
pearson_ax.set_title('Pearson Residuals vs Fitted', **title_style)

# Bottom-left: histogram of Pearson residuals with a standard normal density
hist_ax.hist(pearson_residuals, bins=50, density=True, alpha=0.7, color='steelblue', edgecolor='white')
x_norm = np.linspace(-4, 4, 100)
hist_ax.plot(x_norm, stats.norm.pdf(x_norm), 'r-', linewidth=2, label='Standard Normal')
hist_ax.set_xlabel('Pearson Residuals', fontsize=12)
hist_ax.set_ylabel('Density', fontsize=12)
hist_ax.set_title('Distribution of Pearson Residuals', **title_style)
hist_ax.legend()

# Bottom-right: normal Q-Q plot of the Pearson residuals
stats.probplot(pearson_residuals, dist='norm', plot=qq_ax)
qq_ax.set_title('Q-Q Plot of Pearson Residuals', **title_style)

plt.tight_layout()
plt.show()

# Dispersion estimate: Pearson chi-squared divided by
# (number of test rows - number of features - 1 for the intercept)
pearson_chi2 = np.sum(pearson_residuals**2)
n_test = len(y_test)
residual_dof = n_test - len(feature_cols) - 1
dispersion = pearson_chi2 / residual_dof

print(f"\nüìä Overdispersion Check:")
print(f"   Pearson Chi-squared: {pearson_chi2:.2f}")
print(f"   Degrees of freedom:  {n_test - len(feature_cols) - 1}")
print(f"   Dispersion parameter: {dispersion:.2f}")

# Verdict thresholds: above 1.5 -> overdispersed, below 0.5 -> underdispersed
if dispersion > 1.5:
    print("   ‚ö†Ô∏è Evidence of overdispersion. Consider Negative Binomial.")
elif dispersion < 0.5:
    print("   ‚ö†Ô∏è Evidence of underdispersion.")
else:
    print("   ‚úÖ Dispersion looks reasonable for Poisson.")

## 🎯 Feature Importance: Interpreting Coefficients

---

Here's the cool part about Poisson regression: **coefficients have a multiplicative interpretation!**

Since $\log(\mu) = \theta^T x$, we have $\mu = e^{\theta^T x}$.

If we increase feature $x_j$ by 1 unit:
$$\mu_{new} = e^{\theta_0 + ... + \theta_j(x_j + 1) + ...} = \mu_{old} \cdot e^{\theta_j}$$

So $e^{\theta_j}$ is the **rate ratio** – the multiplicative change in the expected count for a one-unit increase in $x_j$!

- $e^{\theta_j} > 1$: Feature increases count
- $e^{\theta_j} < 1$: Feature decreases count
- $e^{\theta_j} = 1$: No effect

In [None]:
# Coefficients of the scikit-learn fit and their exponentials (rate ratios).
# Features were standardised before fitting, so one unit equals one standard deviation.
coefficients = model_sklearn.coef_
rate_ratios = np.exp(coefficients)

table_rule = "=" * 70

# Summary table: one row per feature
print("üìä Feature Importance (Rate Ratios)")
print(table_rule)
print(f"{'Feature':<15} {'Coefficient':>12} {'Rate Ratio':>12} {'Interpretation'}")
print("-"*70)

for feat, coef, rr in zip(feature_cols, coefficients, rate_ratios):
    # Percentage change in the expected count; increases get an explicit '+'
    change = f"{(rr-1)*100:.1f}% per 1 SD increase"
    if rr > 1:
        change = "+" + change
    print(f"{feat:<15} {coef:>12.4f} {rr:>12.4f} {change}")

print(table_rule)
print(f"\nIntercept: {model_sklearn.intercept_:.4f}")
print(f"Baseline rate (exp(intercept)): {np.exp(model_sklearn.intercept_):.2f} rentals/hour")

In [None]:
# Two horizontal bar charts: raw coefficients (left) and rate ratios (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: coefficients on the log scale, green = positive, red = otherwise
ax1 = axes[0]
colors = ['green' if c > 0 else 'red' for c in coefficients]
bars = ax1.barh(feature_cols, coefficients, color=colors, alpha=0.7, edgecolor='white')
ax1.axvline(x=0, color='black', linewidth=1)
ax1.set_xlabel('Coefficient (log scale)', fontsize=12)
ax1.set_title('Poisson Regression Coefficients', fontsize=14, fontweight='bold')
for i, (feat, coef) in enumerate(zip(feature_cols, coefficients)):
    # Anchor the label on the outer side of the bar so that labels of
    # negative coefficients extend leftwards instead of covering the bar.
    ax1.text(
        coef + 0.02 * np.sign(coef), i, f'{coef:.3f}',
        va='center', ha='left' if coef >= 0 else 'right', fontsize=10,
    )

# Right panel: rate ratios, with 1.0 marking "no effect"
ax2 = axes[1]
colors = ['green' if rr > 1 else 'red' for rr in rate_ratios]
ax2.barh(feature_cols, rate_ratios, color=colors, alpha=0.7, edgecolor='white')
ax2.axvline(x=1, color='black', linewidth=2, linestyle='--', label='No effect')
ax2.set_xlabel('Rate Ratio (multiplicative effect)', fontsize=12)
ax2.set_title('Rate Ratios: exp(coefficient)', fontsize=14, fontweight='bold')
for i, (feat, rr) in enumerate(zip(feature_cols, rate_ratios)):
    ax2.text(rr + 0.02, i, f'{rr:.2f}x', va='center', fontsize=10)
ax2.legend()

plt.tight_layout()
plt.show()

# Insights are generated from the fitted coefficients rather than written by
# hand, so the text can never contradict the numbers shown in the charts.
# Features are listed from largest to smallest absolute coefficient.
print("\nüí° Key Insights:")
ranked_effects = sorted(
    zip(feature_cols, coefficients, rate_ratios),
    key=lambda effect: abs(effect[1]),
    reverse=True,
)
for feat, coef, rr in ranked_effects:
    if coef > 0:
        direction = "INCREASES"
    elif coef < 0:
        direction = "DECREASES"
    else:
        direction = "does not change"
    print(
        f"   - {feat} {direction} rentals: {(rr - 1) * 100:+.1f}% per 1 SD increase "
        f"(coefficient {coef:+.3f})"
    )

## 🔮 Predictions in Action

---

Let's see how our model predicts for different scenarios. This is where a GLM really shines – we can interpret predictions directly!

In [None]:
# Hand-picked conditions to score with the fitted scikit-learn model
scenarios = [
    {'name': 'Cold weekday morning', 'hour': 8, 'temperature': 5, 'humidity': 0.8, 'windspeed': 20, 'is_weekend': 0, 'is_holiday': 0},
    {'name': 'Warm weekday evening', 'hour': 18, 'temperature': 25, 'humidity': 0.4, 'windspeed': 10, 'is_weekend': 0, 'is_holiday': 0},
    {'name': 'Perfect weekend noon', 'hour': 12, 'temperature': 22, 'humidity': 0.5, 'windspeed': 5, 'is_weekend': 1, 'is_holiday': 0},
    {'name': 'Hot humid afternoon', 'hour': 15, 'temperature': 35, 'humidity': 0.9, 'windspeed': 5, 'is_weekend': 0, 'is_holiday': 0},
    {'name': 'Night shift', 'hour': 3, 'temperature': 15, 'humidity': 0.6, 'windspeed': 10, 'is_weekend': 0, 'is_holiday': 0},
]

report_rule = "=" * 70

print("üîÆ Rental Predictions for Different Scenarios")
print(report_rule)

for scenario in scenarios:
    # Single-row feature matrix, columns in the same order the scaler was fitted on
    X_scenario = np.array([[scenario[col] for col in feature_cols]])
    # Apply the training-set standardisation before predicting
    X_scenario_scaled = scaler.transform(X_scenario)

    # Expected number of rentals for this scenario
    pred = model_sklearn.predict(X_scenario_scaled)[0]

    print(f"\nüìç {scenario['name']}:")
    print(f"   Conditions: {scenario['temperature']}¬∞C, {scenario['humidity']*100:.0f}% humidity, {scenario['windspeed']} km/h wind")
    print(f"   Predicted rentals: {pred:.1f} bikes")

print("\n" + report_rule)
print("üí° Notice how predictions are ALWAYS positive ‚Äì that's the power of log link!")

## 🏁 Conclusion: When to Use Poisson Regression

---

### What We Learned

1. **Count data needs special treatment**: Standard linear regression can predict negative counts, which is nonsense!

2. **Poisson regression is perfect for counts**:
   - Log link ensures positive predictions
   - Models variance = mean (common in count data)
   - Coefficients have multiplicative interpretation

3. **The GLM framework** connects:
   - **Distribution**: Poisson (for counts)
   - **Link function**: log (ensures positivity)
   - **Linear predictor**: $\theta^T x$ (same as linear regression!)

---

### When to Use What?

| Data Type | Distribution | Link | Use When |
|-----------|--------------|------|----------|
| Continuous | Normal | Identity | Standard regression |
| Binary (0/1) | Bernoulli | Logit | Classification |
| **Counts** | **Poisson** | **Log** | **Events, rentals, arrivals** |
| Counts (overdispersed) | Negative Binomial | Log | Variance > Mean |
| Positive continuous | Gamma | Log | Costs, times |

---

### Key Takeaways

- **Poisson regression** is the go-to for count data
- Check for **overdispersion** (variance > mean) – if severe, use Negative Binomial
- **Rate ratios** ($e^{\theta_j}$) give intuitive multiplicative effects
- GLM extends regression to many data types ‚Äì same gradient descent framework!

🎓 **Master this pattern and you can model any data type!**

In [None]:
# Closing recap: headline metrics, strongest features, and suggested follow-ups
print("üéØ FINAL SUMMARY")
print("="*60)

# Held-out error of the scikit-learn model
print("\nüìä Model Performance:")
print(f"   MAE:  {mae_sklearn:.2f} bikes (average error)")
print(f"   RMSE: {rmse_sklearn:.2f} bikes")

# Four features with the largest absolute coefficients, strongest first
print("\nüîë Key Features (by importance):")
sorted_idx = np.argsort(np.abs(coefficients))[::-1]
top_four = sorted_idx[:4]
for i, idx in enumerate(top_four, start=1):
    print(f"   {i}. {feature_cols[idx]}: {coefficients[idx]:+.3f} (rate ratio: {rate_ratios[idx]:.2f}x)")

for takeaway in (
    "\n‚úÖ Poisson GLM successfully models bike rental counts!",
    "   ‚Üí Always positive predictions",
    "   ‚Üí Interpretable coefficients",
    "   ‚Üí Proper handling of count data variance",
):
    print(takeaway)

for next_step in (
    "\nüöÄ Next Steps:",
    "   1. Try Negative Binomial if overdispersion is severe",
    "   2. Add more features (season, weather category)",
    "   3. Consider zero-inflation if many zeros exist",
    "   4. Try time-series approaches for temporal patterns",
):
    print(next_step)