In [2]:
import pandas as pd
import numpy as np

# Stage 1: read the raw school-level file and trim it to the columns used below.
input_path = "ID 814 SCH - Chronic Absenteeism.csv"

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
df = pd.read_csv(input_path, low_memory=False)
print("Raw ID814 shape:", df.shape)

# NOTE(review): the ID columns are read with inferred dtypes. If they are
# parsed as integers, any leading zeros in the file are lost -- confirm this
# matches the key format of whatever this table is joined to later.

# Columns carried forward (names are the post-rename ones).
keep_cols = [
    "LEAID", "SCHID", "SCH_NAME",
    "TOTAL_STUDENTS_REPORTED_M", "TOTAL_STUDENTS_REPORTED_F",
    "H_M", "H_F"
]

# Harmonize the district-ID and school-name headers, then take an independent
# copy of just the retained columns. A missing column raises KeyError here.
df = (
    df.rename(columns={"NCESLEAID": "LEAID", "SCHOOL_NAME": "SCH_NAME"})
      .loc[:, keep_cols]
      .copy()
)

# Stage 2: coerce the count columns to numbers, derive per-school totals,
# discard unusable rows, and compute each school's chronic-absenteeism rate.
count_cols = ["TOTAL_STUDENTS_REPORTED_M", "TOTAL_STUDENTS_REPORTED_F", "H_M", "H_F"]

for c in count_cols:
    # errors="coerce" turns unparseable entries into NaN; negative values
    # (EDFacts-style missing codes such as -8, -9) are then blanked as well.
    parsed = pd.to_numeric(df[c], errors="coerce")
    df[c] = parsed
    df.loc[parsed < 0, c] = np.nan

# School totals across both sexes. A NaN in either addend propagates, so a
# school missing one of the two counts gets a NaN total and is dropped below.
df = df.assign(
    enrolled=df["TOTAL_STUDENTS_REPORTED_M"].add(df["TOTAL_STUDENTS_REPORTED_F"]),
    chron_absent=df["H_M"].add(df["H_F"]),
)

# Keep rows that have both identifiers and both totals ...
required_cols = ["LEAID", "SCHID", "enrolled", "chron_absent"]
df = df.dropna(subset=required_cols)

# ... with positive enrollment and an absent count in [0, enrolled].
has_students = df["enrolled"].gt(0)
absent_in_range = df["chron_absent"].between(0, df["enrolled"])
df = df[has_students & absent_in_range]

# Chronic absenteeism rate per school, expressed in percent.
df = df.assign(absent_rate=df["chron_absent"].div(df["enrolled"]).mul(100))

print("Clean school rows:", df.shape)

# --- Stage 3: roll schools up to one row per LEA (district) and export ---

# Per-school numerator of the enrollment-weighted mean (rate x enrollment).
df["weighted_absent"] = df["absent_rate"].mul(df["enrolled"])

# Output column -> (source column, aggregation) for the district roll-up.
lea_aggregations = {
    "total_enrollment": ("enrolled", "sum"),
    "num_schools_reporting": ("SCHID", "nunique"),
    "mean_school_absent_rate": ("absent_rate", "mean"),
    "weighted_absent_sum": ("weighted_absent", "sum"),
}

lea = (
    df.groupby("LEAID", as_index=False)
      .agg(**lea_aggregations)
      # Enrollment-weighted rate: sum(rate x enrollment) / sum(enrollment).
      .assign(
          weighted_absent_rate=lambda t: t["weighted_absent_sum"] / t["total_enrollment"]
      )
      # The intermediate sum is only needed to form the ratio above.
      .drop(columns="weighted_absent_sum")
)

print("Final LEA shape:", lea.shape)
print(lea.head())

# Write the district-level table without the positional index.
output_path = "crdc_lea_absenteeism_2017_18.csv"
lea.to_csv(output_path, index=False)
print(f"Saved {output_path}")


Raw ID814 shape: (92437, 32)
Clean school rows: (46563, 10)
Final LEA shape: (8567, 5)
    LEAID  total_enrollment  num_schools_reporting  mean_school_absent_rate  \
0  100005             532.0                      4                 3.379832   
1  100006             924.0                     12                 8.865099   
2  100007             434.0                      2                 3.591439   
3  100008             429.0                      4                 6.051376   
4  100011              61.0                      1                 8.196721   

   weighted_absent_rate  
0              2.819549  
1              8.982684  
2              2.995392  
3              5.128205  
4              8.196721  
Saved crdc_lea_absenteeism_2017_18.csv
