In [None]:
# Survival Modeling for Interconnection Queue Analysis
# Author: Justin Candler / Nous Enterprise
# Description: Estimate project survival probabilities, compute entropy metrics, and visualize queue survival patterns across regions, vintages, and technologies.

# --- Cell 1: Imports ---
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lifelines import KaplanMeierFitter
from scipy.stats import entropy

In [None]:
# --- Cell 2: Load Data ---
# Update this path to point to the appropriate queue data CSV
df = pd.read_csv('../data/queues_2023_clean_data_r1.csv')

# Expected columns: ['ProjectID', 'ISO', 'TechType', 'QueueDate', 'CODDate', 'Status']
# Fail fast, naming the missing columns, rather than surfacing a bare KeyError
# in a later cell. ProjectID is listed above for reference but is not required
# here because the cells below do not read it.
required_columns = ['ISO', 'TechType', 'QueueDate', 'CODDate', 'Status']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Queue data is missing required columns: {missing_columns}")

In [None]:
# --- Cell 3: Preprocessing ---
# Parse both date columns so their difference is a Timedelta.
df['QueueDate'] = pd.to_datetime(df['QueueDate'])
df['CODDate'] = pd.to_datetime(df['CODDate'])

# Duration, in whole days, from queue entry to COD.
df['SurvivalTime'] = (df['CODDate'] - df['QueueDate']).dt.days

# Event indicator: 1 when Status is exactly 'Operational', otherwise 0.
# Vectorized comparison instead of a row-by-row lambda; missing Status still maps to 0.
df['Event'] = (df['Status'] == 'Operational').astype(int)

# Drop NaNs or invalid entries
# NOTE(review): SurvivalTime is NaN whenever QueueDate or CODDate is missing, so
# this also removes every project without a COD date -- presumably most of the
# non-operational (censored) projects. Confirm that is intended; keeping them
# would require a censoring date that this dataset may or may not provide.
df = df.dropna(subset=['SurvivalTime', 'Event'])

# A COD date earlier than the queue date gives a negative duration, which is not
# a meaningful time-in-queue; treat those rows as invalid and drop them too.
# .copy() keeps later column assignments from targeting a slice of the old frame.
df = df.loc[df['SurvivalTime'] >= 0].copy()

In [None]:
# --- Cell 4: Kaplan-Meier Survival Estimation ---
# Fit a single Kaplan-Meier estimator on all projects. 'Event' flags which
# durations ended in an observed event; the remaining rows are treated as censored.
kmf = KaplanMeierFitter()
kmf.fit(df['SurvivalTime'], event_observed=df['Event'])

# Plot
# Create the axes explicitly and pass it to lifelines so the curve is drawn on
# this figure rather than on whichever axes happens to be current.
fig, ax = plt.subplots(figsize=(10, 6))
kmf.plot(ax=ax)
ax.set_title('Kaplan-Meier Survival Curve (All Projects)')
ax.set_xlabel('Days in Queue')
ax.set_ylabel('Survival Probability')
ax.grid(True)
plt.show()

In [None]:
# --- Cell 5: Survival by Technology Type ---
# Technologies with this many projects or fewer are skipped (minimum threshold).
MIN_PROJECTS_PER_TECH = 20

# One shared axes, passed explicitly to every plot call, so all technology
# curves are overlaid on the same figure.
fig, ax = plt.subplots(figsize=(12, 8))

# groupby splits the frame in a single pass instead of re-scanning it once per
# technology; sort=False keeps technologies in first-appearance order, and rows
# with a missing TechType are left out.
for tech, subset in df.groupby('TechType', sort=False):
    if len(subset) > MIN_PROJECTS_PER_TECH:
        # Refit the shared estimator on this technology; `label` names the curve in the legend.
        kmf.fit(subset['SurvivalTime'], event_observed=subset['Event'], label=tech)
        kmf.plot(ax=ax)

ax.set_title('Survival Curves by Technology Type')
ax.set_xlabel('Days in Queue')
ax.set_ylabel('Probability of Survival')
ax.grid(True)
ax.legend()
plt.show()

In [None]:
# --- Cell 6: Entropy Estimation by ISO and Vintage ---
def compute_entropy(group):
    """Return the Shannon entropy, in bits, of the 'Event' values in ``group``.

    Args:
        group: Object whose ``group['Event']`` is a pandas Series, such as one
            ISO/QueueYear slice of the project table.

    Returns:
        Base-2 entropy of the relative frequencies of the distinct 'Event'
        values; 0 when every row has the same value.
    """
    # Share of rows taken by each distinct Event value (missing values are not counted).
    outcome_shares = group['Event'].value_counts(normalize=True)
    bits = entropy(outcome_shares, base=2)
    return bits

# Vintage: the calendar year in which the project entered the queue.
df['QueueYear'] = df['QueueDate'].dt.year

# Select only 'Event' before apply so the grouping columns are not handed to
# compute_entropy (recent pandas versions deprecate applying over them); the
# function reads nothing but 'Event', so the values are unchanged.
entropy_df = (
    df.groupby(['ISO', 'QueueYear'])[['Event']]
    .apply(compute_entropy)
    .reset_index()
)
entropy_df.columns = ['ISO', 'QueueYear', 'SurvivalEntropy']

# Plot: one entropy line per ISO across queue years.
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=entropy_df,
    x='QueueYear',
    y='SurvivalEntropy',
    hue='ISO',
    marker='o',
).set(
    # lineplot returns the Axes it drew on, so title and labels are set in one call.
    title='Queue Survival Entropy (bits) by ISO and Year',
    xlabel='Queue Year',
    ylabel='Entropy (bits)',
)
plt.grid(True)
plt.show()

In [None]:
# --- Cell 7: Export Processed Data ---
output_path = Path('../outputs/survival_entropy_report.csv')

# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails on this last cell.
output_path.parent.mkdir(parents=True, exist_ok=True)

entropy_df.to_csv(output_path, index=False)
# Report the path that was actually written.
print(f"Saved entropy metrics to {output_path}")