# 🗺️ California District Prioritization Model
## ML-Powered Account Scoring for K-8 Literacy Sales
---
**Purpose:** Score and rank all ~1,000 California school districts by likelihood to purchase  
Literacy Partners' professional development services.

### Model Output
Each district receives a **Partnership Readiness Score (0–100)**, the weighted sum of 5 dimensions:

| Dimension | Weight | Data Source |
|-----------|--------|-------------|
| Literacy Need (test scores) | 30% | CAASPP |
| Budget Availability | 25% | EdData, USASpending |
| SOR Adoption Signal | 20% | News, board minutes |
| Leadership Openness | 15% | LinkedIn, Glassdoor |
| Geographic Access | 10% | District location (distance from Los Angeles) |


## 🔧 Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
import warnings
# Suppress every warning category for the remainder of the session so the
# notebook output stays uncluttered.
warnings.filterwarnings(action="ignore")

# Shared palette used by the charts in this notebook.
BRAND_COLORS = dict(
    primary="#2E4057",
    secondary="#048A81",
    accent="#F18F01",
    danger="#C73E1D",
)
print("Setup complete.")


## 📥 Step 1: Load District Data

In [None]:
# ============================================================
# TODO (Jules): Replace with live CAASPP API + EdData scraper
# CAASPP API: https://caaspp-elpac.ets.org/caaspp/ResearchFileList
# CDE List:   https://www.cde.ca.gov/ds/si/ds/
# ============================================================
# Until the live sources are wired in, this cell fabricates a random
# placeholder table so the rest of the notebook can run end to end.

np.random.seed(42)  # fixed seed keeps the placeholder data reproducible
n = 200  # number of synthetic districts generated before filtering

districts = pd.DataFrame({
    "district_name": [f"District_{i:03d}" for i in range(n)],
    "county": np.random.choice(["Los Angeles","San Diego","Sacramento","Fresno","Orange",
                                 "Riverside","San Bernardino","Alameda","Kern","Santa Clara"], n),
    "enrollment_k8": np.random.randint(500, 80000, n),
    "pct_ela_proficient": np.random.uniform(20, 75, n),
    "pct_title1_students": np.random.uniform(10, 95, n),
    "pd_budget_per_student_est": np.random.uniform(50, 500, n),
    "sor_adoption_signal": np.random.choice(["None","Exploring","Committed","Implementing"], n,
                                             p=[0.3, 0.3, 0.25, 0.15]),
    "recent_literacy_initiative": np.random.choice([True, False], n, p=[0.4, 0.6]),
    "superintendent_tenure_yrs": np.random.uniform(0.5, 15, n),
    "teacher_turnover_rate": np.random.uniform(5, 45, n),
    "district_type": np.random.choice(["Elementary","Unified","High School"], n, p=[0.4, 0.45, 0.15]),
    "miles_from_la": np.random.uniform(0, 400, n),
})

# Drop "High School" districts. The explicit .copy() makes `districts` an
# independent frame, so columns added to it in later cells are written to
# it directly rather than to a filtered slice of the original frame (which
# pandas reports with SettingWithCopyWarning).
districts = districts[districts["district_type"].isin(["Elementary", "Unified"])].copy()
print(f"Loaded {len(districts)} K-8 relevant districts in California")
print(districts.head())


## 🧮 Step 2: Partnership Readiness Score

In [None]:
def _as_float(value):
    """Return *value* as a float, or NaN when it is missing or non-numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float("nan")


def _clamp01(fraction):
    """Clamp *fraction* to the range [0, 1]; NaN (missing data) becomes 0."""
    if fraction != fraction:  # NaN is the only value not equal to itself
        return 0.0
    return min(max(fraction, 0.0), 1.0)


def _is_set(flag):
    """Truthiness test that treats missing values (None / NaN / NA) as False."""
    try:
        return bool(flag == flag) and bool(flag)
    except (TypeError, ValueError):
        # Some missing-value markers refuse to be coerced to bool.
        return False


def score_district(row):
    """Compute the Partnership Readiness Score for a single district.

    Args:
        row: Mapping-like record (a DataFrame row or a dict) with the keys
            ``pct_ela_proficient``, ``pd_budget_per_student_est``,
            ``sor_adoption_signal``, ``recent_literacy_initiative``,
            ``superintendent_tenure_yrs`` and ``miles_from_la``.

    Returns:
        The sum of the five component scores, rounded to two decimals.
        Each component is bounded to ``[0, weight]`` so the total always
        lies in 0-100, and a missing or non-numeric input contributes 0
        to its component instead of turning the whole score into NaN.
    """
    score = 0

    # 1. Literacy Need (30%): full points at 0% proficient, none at >= 60%.
    ela = _as_float(row["pct_ela_proficient"])
    score += _clamp01((60 - ela) / 60) * 30

    # 2. Budget (25%): scales linearly, capped at $500 per student.
    budget = _as_float(row["pd_budget_per_student_est"])
    score += _clamp01(budget / 500) * 25

    # 3. SOR Signal (20%): fixed points per adoption stage; unknown stages get 0.
    sor_map = {"None": 0, "Exploring": 10, "Committed": 16, "Implementing": 20}
    score += sor_map.get(row["sor_adoption_signal"], 0)

    # 4. Leadership Openness (15%): a recent initiative and a superintendent
    #    with under 3 years of tenure each add points.
    leadership = 0
    if _is_set(row["recent_literacy_initiative"]):
        leadership += 8
    if _as_float(row["superintendent_tenure_yrs"]) < 3:  # NaN compares False
        leadership += 7
    score += min(leadership, 15)

    # 5. Geography (10%): full points at 0 miles, none at >= 400 miles.
    miles = _as_float(row["miles_from_la"])
    score += _clamp01((400 - miles) / 400) * 10

    return round(score, 2)

# Score each district one row at a time and store the result as a new column.
readiness_scores = districts.apply(score_district, axis=1)
districts["partnership_readiness_score"] = readiness_scores

def tier(s):
    """Translate a readiness score into its outreach tier label.

    Scores of 70 or more map to Tier 1, scores from 50 up to (but not
    including) 70 map to Tier 2, and every other value maps to Tier 3.
    """
    # Ordered from the highest cutoff down; the first cutoff met wins.
    cutoffs = (
        (70, "Tier 1 — Immediate Outreach"),
        (50, "Tier 2 — Nurture"),
    )
    for minimum, label in cutoffs:
        if s >= minimum:
            return label
    return "Tier 3 — Monitor"

# Label every district with the tier that matches its readiness score.
districts["tier"] = districts["partnership_readiness_score"].apply(tier)

# Headline counts for the two actionable tiers.
tier_labels = districts["tier"]
tier1_count = tier_labels.str.contains("Tier 1").sum()
tier2_count = tier_labels.str.contains("Tier 2").sum()

# Ten highest-scoring districts, trimmed to the columns worth reading.
summary_columns = [
    "district_name",
    "county",
    "enrollment_k8",
    "pct_ela_proficient",
    "sor_adoption_signal",
    "partnership_readiness_score",
    "tier",
]
top10 = districts.nlargest(10, "partnership_readiness_score")[summary_columns]

print("DISTRICT SCORING COMPLETE")
print(f"Tier 1 (Immediate): {tier1_count} districts")
print(f"Tier 2 (Nurture):   {tier2_count} districts")
print()
print("TOP 10 PRIORITY DISTRICTS:")
print(top10.to_string(index=False))


## 📊 Visualization

In [None]:
# Two-panel dashboard: need-vs-budget bubble chart (left) and the 15
# highest-scoring districts (right). The figure is also saved as a PNG.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle("California District Prioritization Dashboard", fontsize=14, fontweight="bold")

# Matrix: each bubble is a district, coloured by readiness score and sized
# by K-8 enrollment.
scatter = axes[0].scatter(districts["pct_ela_proficient"],
                           districts["pd_budget_per_student_est"],
                           c=districts["partnership_readiness_score"],
                           cmap="RdYlGn", alpha=0.7,
                           s=districts["enrollment_k8"]/500)
axes[0].set_xlabel("ELA Proficiency % (lower = higher need)")
axes[0].set_ylabel("PD Budget per Student ($)")
axes[0].set_title("Need vs. Budget Matrix (bubble = enrollment)")
axes[0].invert_xaxis()  # put the highest-need (lowest proficiency) districts on the right
plt.colorbar(scatter, ax=axes[0], label="Readiness Score")

# Top 15: Tier 1 bars use the "danger" colour, all other tiers the "accent" colour.
top15 = districts.nlargest(15, "partnership_readiness_score")
colors = [BRAND_COLORS["danger"] if "Tier 1" in t else BRAND_COLORS["accent"] for t in top15["tier"]]
axes[1].barh(top15["district_name"], top15["partnership_readiness_score"], color=colors, alpha=0.85)
# barh draws the first row at the bottom; flip the axis so the
# highest-scoring district is at the top of the ranking.
axes[1].invert_yaxis()
axes[1].set_xlabel("Partnership Readiness Score")
axes[1].set_title("Top 15 Priority Districts")
axes[1].axvline(x=70, color="red", linestyle="--", alpha=0.5, label="Tier 1 threshold")
axes[1].legend()

plt.tight_layout()
# Save before show(): the figure is written to disk while it is still current.
plt.savefig("district_prioritization_matrix.png", dpi=150, bbox_inches="tight")
plt.show()


## 🤖 Step 3: ML Conversion Predictor

In [None]:
# Encode categoricals.
# SOR adoption is an ordered progression, so it gets an explicit ordinal
# code. LabelEncoder assigns codes in sorted (alphabetical) order, which
# would rank "None" above "Implementing". Unrecognised stages fall back to 0.
sor_order = {"None": 0, "Exploring": 1, "Committed": 2, "Implementing": 3}
districts["sor_enc"] = districts["sor_adoption_signal"].map(sor_order).fillna(0).astype(int)

# County has no natural order. It is encoded here but is not one of the
# model features listed below.
le = LabelEncoder()
districts["county_enc"] = le.fit_transform(districts["county"])

features = ["enrollment_k8","pct_ela_proficient","pct_title1_students",
            "pd_budget_per_student_est","sor_enc","recent_literacy_initiative",
            "superintendent_tenure_yrs","teacher_turnover_rate","miles_from_la"]

X = districts[features].fillna(0)
# Simulated target — TODO: replace with real LP conversion data
# NOTE: this label is a threshold on the rule-based readiness score, so the
# accuracies printed below only show how well the forest reproduces that
# rule; they say nothing about real conversions.
districts["converted"] = (districts["partnership_readiness_score"] > 65).astype(int)
y = districts["converted"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
# Reuse the training-set statistics for the held-out rows; refitting on the
# test split would leak information from it.
X_test_s = scaler.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_s, y_train)
cv = cross_val_score(model, X_train_s, y_train, cv=5)
print(f"CV Accuracy: {cv.mean():.3f} +/- {cv.std():.3f}")
# Evaluate on the 20% split that was set aside above and otherwise never used.
print(f"Held-out Accuracy: {model.score(X_test_s, y_test):.3f}")

# Feature importance
fi = pd.DataFrame({"Feature": features, "Importance": model.feature_importances_})
print(fi.sort_values("Importance", ascending=False).to_string(index=False))


## 📤 Export

In [None]:
# Build the file for HubSpot import: one row per scored district, ordered
# from the highest readiness score to the lowest.
export_columns = [
    "district_name",
    "county",
    "enrollment_k8",
    "pct_ela_proficient",
    "sor_adoption_signal",
    "pd_budget_per_student_est",
    "partnership_readiness_score",
    "tier",
]
export = districts[export_columns]

ranked_export = export.sort_values("partnership_readiness_score", ascending=False)
ranked_export.to_csv("top_priority_districts.csv", index=False)

tier1_targets = export["tier"].str.contains("Tier 1").sum()
print(f"Exported {len(export)} districts to top_priority_districts.csv")
print(f"Tier 1 targets: {tier1_targets}")
