In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling shared by every figure below
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Read the Millburn absence history export
df = pd.read_excel('../raw_data/16618 - Millburn Absence History 7.1.20 to 6.30.25.xlsx')

print(f'Original data shape: {df.shape}')

# Rule 1: drop records where Filled = 'Unfilled' AND Needs Substitute = 'NO'.
# Expressed as the De Morgan equivalent: keep a row when at least one of the
# two conditions does not hold.
not_unfilled = df['Filled'] != 'Unfilled'
not_no_substitute = df['Needs Substitute'] != 'NO'
df = df.loc[not_unfilled | not_no_substitute]
print(f'After Unfilled/NO filter: {df.shape}')

# Rule 2: keep only the three teacher employee types
teacher_types = ['Teacher', 'Teacher Music', 'Teacher SpecEd']
is_teacher = df['Employee Type'].isin(teacher_types)
df = df.loc[is_teacher]
print(f'After Employee Type filter: {df.shape}')

df.head()



Original data shape: (94538, 13)
After Unfilled/NO filter: (51127, 13)
After Employee Type filter: (39948, 13)


Unnamed: 0,School Year,Date,Reason,Employee Identifier,Hire Date,Employee Title,Employee Type,Start Time,End Time,Duration,Absence Type,Filled,Needs Substitute
8508,2021-2022,2021-10-25,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,15:50:00,7.5,Full Day,Unfilled,YES
8509,2021-2022,2021-11-17,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,15:50:00,7.5,Full Day,Unfilled,YES
8510,2021-2022,2021-11-24,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,15:50:00,7.5,Full Day,Unfilled,YES
8511,2021-2022,2021-12-15,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,11:40:00,15:50:00,4.166666,PM Half Day,Filled,YES
8512,2021-2022,2021-12-15,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,11:40:00,3.333333,AM Half Day,Unfilled,YES


In [2]:
# ============================================================================
# EXPLORATORY DATA ANALYSIS - MILLBURN
# ============================================================================

banner = "=" * 80
n_rows, n_cols = df.shape

print(banner)
print("MILLBURN DATA EXPLORATION")
print(banner)
print(f"\nDataset Shape: {n_rows:,} rows × {n_cols} columns")

# Numbered column listing (1-based)
print("\nColumn Names:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"  {position}. {column_name}")

print("\nData Types:")
print(df.dtypes)

# Per-column null counts and percentages, restricted to columns that actually
# have nulls and ordered from most to fewest missing.
print("\nMissing Values:")
null_counts = df.isna().sum()
missing_df = (
    pd.DataFrame({
        'Missing Count': null_counts,
        'Missing %': (null_counts / len(df) * 100).round(2),
    })
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)
if missing_df.empty:
    print("  No missing values!")
else:
    print(missing_df)

print("\nFirst few rows:")
df.head(10)

MILLBURN DATA EXPLORATION

Dataset Shape: 39,948 rows × 13 columns

Column Names:
  1. School Year
  2. Date
  3. Reason
  4. Employee Identifier
  5. Hire Date
  6. Employee Title
  7. Employee Type
  8. Start Time
  9. End Time
  10. Duration
  11. Absence Type
  12. Filled
  13. Needs Substitute

Data Types:
School Year                    object
Date                   datetime64[ns]
Reason                         object
Employee Identifier            object
Hire Date              datetime64[ns]
Employee Title                 object
Employee Type                  object
Start Time                     object
End Time                       object
Duration                      float64
Absence Type                   object
Filled                         object
Needs Substitute               object
dtype: object

Missing Values:
                Missing Count  Missing %
Hire Date               16388      41.02
Employee Title            154       0.39

First few rows:


Unnamed: 0,School Year,Date,Reason,Employee Identifier,Hire Date,Employee Title,Employee Type,Start Time,End Time,Duration,Absence Type,Filled,Needs Substitute
8508,2021-2022,2021-10-25,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,15:50:00,7.5,Full Day,Unfilled,YES
8509,2021-2022,2021-11-17,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,15:50:00,7.5,Full Day,Unfilled,YES
8510,2021-2022,2021-11-24,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,15:50:00,7.5,Full Day,Unfilled,YES
8511,2021-2022,2021-12-15,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,11:40:00,15:50:00,4.166666,PM Half Day,Filled,YES
8512,2021-2022,2021-12-15,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,11:40:00,3.333333,AM Half Day,Unfilled,YES
8513,2021-2022,2021-12-21,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,15:50:00,7.5,Full Day,Unfilled,YES
8514,2021-2022,2021-12-22,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,15:50:00,7.5,Full Day,Filled,YES
8515,2021-2022,2021-12-23,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,15:50:00,7.5,Full Day,Filled,YES
8516,2021-2022,2022-01-28,Self,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,15:50:00,7.5,Full Day,Unfilled,YES
8517,2021-2022,2022-02-11,Personal,4057,2015-08-13 18:45:38,Grade 3 teacher,Teacher,08:20:00,15:50:00,7.5,Full Day,Filled,YES


In [3]:
# ============================================================================
# DATA OVERVIEW - KEY STATISTICS
# ============================================================================


def print_distribution(counts, total):
    """Print each value in `counts` with its record count and share of `total`.

    `counts` is a value_counts() Series (value -> number of records); the share
    is printed as a percentage with one decimal place.
    """
    for value, n_records in counts.items():
        share = n_records / total * 100
        print(f"  {value}: {n_records:,} ({share:.1f}%)")


banner = "=" * 80
print(banner)
print("KEY STATISTICS")
print(banner)

# Span of absence dates present in the filtered data
first_date = df['Date'].min()
last_date = df['Date'].max()
print("\nDate Range:")
print(f"  Start Date: {first_date}")
print(f"  End Date: {last_date}")
print(f"  Total Days Covered: {(last_date - first_date).days} days")

# Record count per school year, listed in sorted year order
print("\nSchool Years in Dataset:")
school_years = sorted(df['School Year'].unique())
records_per_year = df['School Year'].value_counts()
for year in school_years:
    print(f"  {year}: {int(records_per_year[year]):,} records")

total_records = len(df)
print(f"\nTotal Unique Employees: {df['Employee Identifier'].nunique():,}")
print(f"Total Absence Records: {total_records:,}")

print("\nAbsence Type Distribution:")
absence_type_dist = df['Absence Type'].value_counts()
print_distribution(absence_type_dist, total_records)

print("\nFilled Status Distribution:")
filled_dist = df['Filled'].value_counts()
print_distribution(filled_dist, total_records)

print("\nEmployee Type Distribution:")
emp_type_dist = df['Employee Type'].value_counts()
print_distribution(emp_type_dist, total_records)

KEY STATISTICS

Date Range:
  Start Date: 2020-09-01 00:00:00
  End Date: 2025-06-26 00:00:00
  Total Days Covered: 1759 days

School Years in Dataset:
  2020-2021: 6,153 records
  2021-2022: 8,730 records
  2022-2023: 9,343 records
  2023-2024: 8,148 records
  2024-2025: 7,574 records

Total Unique Employees: 527
Total Absence Records: 39,948

Absence Type Distribution:
  Full Day: 34,983 (87.6%)
  PM Half Day: 2,827 (7.1%)
  AM Half Day: 1,898 (4.8%)
  Custom Duration: 240 (0.6%)

Filled Status Distribution:
  Filled: 29,942 (75.0%)
  Unfilled: 10,006 (25.0%)

Employee Type Distribution:
  Teacher: 35,712 (89.4%)
  Teacher SpecEd: 3,657 (9.2%)
  Teacher Music: 579 (1.4%)


In [4]:
# ============================================================================
# ABSENCE DAYS ANALYSIS
# ============================================================================

# Hours that count as one full day when converting 'Custom Duration' records
# into fractional days.
HOURS_PER_FULL_DAY = 7.5


def compute_absence_days(frame, hours_per_day=HOURS_PER_FULL_DAY):
    """Return the absence length in days for every row of `frame`.

    Conversion rules (driven by the 'Absence Type' column):
      * 'Full Day'                    -> 1.0
      * 'AM Half Day' / 'PM Half Day' -> 0.5
      * 'Custom Duration'             -> Duration / hours_per_day, where a
        Duration that cannot be parsed as a number counts as 0
      * anything else                 -> 0

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain the 'Absence Type' and 'Duration' columns.
    hours_per_day : float, default HOURS_PER_FULL_DAY
        Divisor used for 'Custom Duration' rows.

    Returns
    -------
    pd.Series
        Float values aligned to `frame.index`, named 'Absence_Days'.
    """
    absence_type = frame['Absence Type']

    # Only used for 'Custom Duration' rows; unparseable durations become 0
    # before the division so they contribute 0 days.
    custom_days = (
        pd.to_numeric(frame['Duration'], errors='coerce').fillna(0) / hours_per_day
    )

    # Vectorised replacement for a row-by-row apply: the first matching
    # condition wins, and rows matching none fall through to the default.
    days = np.select(
        [
            absence_type == 'Full Day',
            absence_type.isin(['AM Half Day', 'PM Half Day']),
            absence_type == 'Custom Duration',
        ],
        [1.0, 0.5, custom_days.to_numpy()],
        default=0.0,
    )
    return pd.Series(days, index=frame.index, name='Absence_Days')


# Calculate absence days if not already done
if 'Absence_Days' not in df.columns:
    df['Absence_Days'] = compute_absence_days(df)

print("="*80)
print("ABSENCE DAYS SUMMARY")
print("="*80)

print(f"\nTotal Absence Days: {df['Absence_Days'].sum():,.2f}")
print(f"Average Absence Days per Record: {df['Absence_Days'].mean():.2f}")
print(f"Median Absence Days per Record: {df['Absence_Days'].median():.2f}")

# Total, mean and record count of absence days per absence type,
# largest total first.
print(f"\nAbsence Days by Type:")
absence_days_by_type = df.groupby('Absence Type')['Absence_Days'].agg(['sum', 'mean', 'count'])
absence_days_by_type.columns = ['Total Days', 'Avg Days per Record', 'Count']
print(absence_days_by_type.sort_values('Total Days', ascending=False))

# Same three aggregates per school year, in school-year order.
print(f"\nAbsence Days by School Year:")
absence_days_by_year = df.groupby('School Year')['Absence_Days'].agg(['sum', 'mean', 'count'])
absence_days_by_year.columns = ['Total Days', 'Avg Days per Record', 'Count']
print(absence_days_by_year)

ABSENCE DAYS SUMMARY

Total Absence Days: 37,530.64
Average Absence Days per Record: 0.94
Median Absence Days per Record: 1.00

Absence Days by Type:
                   Total Days  Avg Days per Record  Count
Absence Type                                             
Full Day         34983.000000             1.000000  34983
PM Half Day       1413.500000             0.500000   2827
AM Half Day        949.000000             0.500000   1898
Custom Duration    185.135546             0.771398    240

Absence Days by School Year:
              Total Days  Avg Days per Record  Count
School Year                                         
2020-2021    6065.533333             0.985785   6153
2021-2022    8261.713331             0.946359   8730
2022-2023    8753.111110             0.936863   9343
2023-2024    7510.922222             0.921812   8148
2024-2025    6939.355550             0.916207   7574


In [None]:
# ============================================================================
# TEACHER-LEVEL ANALYSIS
# ============================================================================

# One row per (school year, employee) holding that employee's summed absence days
teacher_absence_days = (
    df.groupby(['School Year', 'Employee Identifier'])['Absence_Days']
    .sum()
    .reset_index(name='Total_Days')
)

banner = "=" * 80
print(banner)
print("TEACHER ABSENCE PATTERNS")
print(banner)

print(f"\nTotal Unique Teachers: {teacher_absence_days['Employee Identifier'].nunique():,}")

# Collapse the per-year rows into a single all-years total per employee
print("\nOverall Statistics (All Years Combined):")
all_teacher_days = teacher_absence_days.groupby('Employee Identifier')['Total_Days'].sum()
overall_stats = [
    ("Mean Absence Days per Teacher", all_teacher_days.mean()),
    ("Median Absence Days per Teacher", all_teacher_days.median()),
    ("Min Absence Days", all_teacher_days.min()),
    ("Max Absence Days", all_teacher_days.max()),
    ("Std Dev", all_teacher_days.std()),
]
for stat_label, stat_value in overall_stats:
    print(f"  {stat_label}: {stat_value:.2f}")

# Ten-day buckets: the first is closed on both ends (0 through 10); each later
# bucket excludes its lower edge and includes its upper edge; the last is open-ended.
print("\nTeachers by Absence Category (All Years Combined):")
in_first_bucket = (all_teacher_days >= 0) & (all_teacher_days <= 10)
print(f"  Teachers with 0-10 days: {int(in_first_bucket.sum()):,}")
for lower_edge in range(10, 80, 10):
    upper_edge = lower_edge + 10
    in_bucket = (all_teacher_days > lower_edge) & (all_teacher_days <= upper_edge)
    print(f"  Teachers with {lower_edge + 1}-{upper_edge} days: {int(in_bucket.sum()):,}")
print(f"  Teachers with >80 days: {int((all_teacher_days > 80).sum()):,}")

# Per-year summary; groupby iterates the school years in sorted order
print("\nBy School Year:")
for year, year_data in teacher_absence_days.groupby('School Year'):
    days_this_year = year_data['Total_Days']
    print(f"\n  {year}:")
    print(f"    Total Teachers: {year_data['Employee Identifier'].nunique():,}")
    print(f"    Mean Days per Teacher: {days_this_year.mean():.2f}")
    print(f"    Median Days per Teacher: {days_this_year.median():.2f}")
    print(f"    Max Days: {days_this_year.max():.2f}")

In [None]:
# ============================================================================
# VISUALIZATION: ABSENCE DAYS DISTRIBUTION
# ============================================================================

# All-years total of absence days per employee
all_teacher_days = teacher_absence_days.groupby('Employee Identifier')['Total_Days'].sum()
mean_days = all_teacher_days.mean()
median_days = all_teacher_days.median()

# Histogram of the per-employee totals with mean/median reference lines
fig, ax = plt.subplots(figsize=(14, 8))
ax.hist(all_teacher_days, bins=50, color='steelblue', alpha=0.7, edgecolor='black')
ax.set_title('Distribution of Total Absence Days per Teacher (All Years Combined)',
             fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Total Absence Days', fontsize=14, fontweight='bold')
ax.set_ylabel('Number of Teachers', fontsize=14, fontweight='bold')
ax.axvline(mean_days, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_days:.1f}')
ax.axvline(median_days, color='green', linestyle='--', linewidth=2, label=f'Median: {median_days:.1f}')
ax.legend(fontsize=12)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

# Text summary of the same distribution
print("Summary Statistics:")
print(f"  Mean: {mean_days:.2f} days")
print(f"  Median: {median_days:.2f} days")
print(f"  Std Dev: {all_teacher_days.std():.2f} days")
for percentile in (25, 75, 90, 95, 99):
    cutoff = all_teacher_days.quantile(percentile / 100)
    print(f"  {percentile}th Percentile: {cutoff:.2f} days")

In [None]:
# ============================================================================
# VISUALIZATION: ABSENCE TRENDS BY SCHOOL YEAR
# ============================================================================


def draw_year_bars(ax, years, values, *, title, ylabel, color, label_gap):
    """Draw one bar per school year on `ax` and write each bar's integer value above it.

    `label_gap` is the vertical offset (in data units) between the top of a
    bar and its text label.
    """
    ax.bar(years, values, color=color, alpha=0.8)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('School Year', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)
    for position, height in enumerate(values):
        ax.text(position, height + label_gap, f'{int(height)}',
                ha='center', va='bottom', fontweight='bold')


# Summed absence days per school year
absence_by_year = (
    df.groupby('School Year')['Absence_Days'].sum()
    .reset_index()
    .sort_values('School Year')
)

# Distinct employees with at least one record per school year
teachers_by_year = (
    df.groupby('School Year')['Employee Identifier'].nunique()
    .reset_index(name='Total Teachers')
    .sort_values('School Year')
)

# Mean of the per-employee yearly totals
avg_days_by_year = (
    teacher_absence_days.groupby('School Year')['Total_Days'].mean()
    .reset_index(name='Avg Days per Teacher')
    .sort_values('School Year')
)

# Number of absence records per school year
absences_by_year = (
    df.groupby('School Year').size()
    .reset_index(name='Total Records')
    .sort_values('School Year')
)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_days, ax_teachers), (ax_average, ax_records) = axes

# Plot 1: Total Absence Days by Year
draw_year_bars(ax_days, absence_by_year['School Year'], absence_by_year['Absence_Days'],
               title='Total Absence Days by School Year', ylabel='Total Absence Days',
               color='steelblue', label_gap=200)

# Plot 2: Number of Teachers by Year
draw_year_bars(ax_teachers, teachers_by_year['School Year'], teachers_by_year['Total Teachers'],
               title='Number of Teachers by School Year', ylabel='Total Teachers',
               color='green', label_gap=5)

# Plot 3: Average Absence Days per Teacher by Year (line with shaded area)
avg_years = avg_days_by_year['School Year']
avg_values = avg_days_by_year['Avg Days per Teacher']
ax_average.plot(avg_years, avg_values, marker='o', linewidth=3, markersize=10, color='red')
ax_average.fill_between(avg_years, avg_values, alpha=0.3, color='red')
ax_average.set_title('Average Absence Days per Teacher by School Year', fontsize=14, fontweight='bold')
ax_average.set_xlabel('School Year', fontsize=12)
ax_average.set_ylabel('Average Days per Teacher', fontsize=12)
ax_average.tick_params(axis='x', rotation=45)
ax_average.grid(True, alpha=0.3)
for position, average in enumerate(avg_values):
    ax_average.text(position, average + 0.5, f'{average:.1f}',
                    ha='center', va='bottom', fontweight='bold')

# Plot 4: Total Absence Records by Year
draw_year_bars(ax_records, absences_by_year['School Year'], absences_by_year['Total Records'],
               title='Total Absence Records by School Year', ylabel='Total Records',
               color='orange', label_gap=100)

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# ABSENCE TYPE AND FILLED STATUS ANALYSIS
# ============================================================================

banner = "=" * 80
print(banner)
print("ABSENCE TYPE AND FILLED STATUS BREAKDOWN")
print(banner)

# Cross-tabulation of absence type against filled status, with row/column totals
print("\nAbsence Type by Filled Status:")
absence_filled_cross = pd.crosstab(df['Absence Type'], df['Filled'], margins=True)
print(absence_filled_cross)

# Share of records marked 'Filled' within each absence type, listed in the
# order the types first appear in the data
print("\nFill Rate by Absence Type:")
was_filled = df['Filled'] == 'Filled'
for abs_type in df['Absence Type'].unique():
    of_this_type = df['Absence Type'] == abs_type
    total_count = int(of_this_type.sum())
    filled_count = int((of_this_type & was_filled).sum())
    if total_count > 0:
        fill_rate = filled_count / total_count * 100
    else:
        fill_rate = 0
    print(f"  {abs_type}: {fill_rate:.1f}% ({filled_count:,}/{total_count:,})")

# Share of all records marked 'Filled'
overall_total = len(df)
overall_filled = int(was_filled.sum())
overall_fill_rate = overall_filled / overall_total * 100
print(f"\nOverall Fill Rate: {overall_fill_rate:.1f}% ({overall_filled:,}/{overall_total:,})")

In [None]:
# ============================================================================
# MONTHLY ABSENCE PATTERNS
# ============================================================================


def print_month_totals(months):
    """Print one line per row of `months`: the year-month label and its absence days."""
    for month_label, days in zip(months['Year_Month_Str'], months['Absence_Days']):
        print(f"  {month_label}: {days:,.2f} days")


# Calendar fields derived from the absence date
absence_dates = df['Date'].dt
df['Year'] = absence_dates.year
df['Month'] = absence_dates.month
df['Month_Name'] = absence_dates.strftime('%B')
df['Year_Month'] = absence_dates.to_period('M')

# Summed absence days per calendar month, in chronological order, with a
# string copy of the period for printing and axis labels
monthly_absences = (
    df.groupby('Year_Month')['Absence_Days'].sum()
    .reset_index()
    .assign(Year_Month_Str=lambda months: months['Year_Month'].astype(str))
    .sort_values('Year_Month')
)

banner = "=" * 80
print(banner)
print("MONTHLY ABSENCE PATTERNS")
print(banner)

print("\nTop 10 Months with Highest Absence Days:")
top_months = monthly_absences.nlargest(10, 'Absence_Days')
print_month_totals(top_months)

print("\nBottom 10 Months with Lowest Absence Days:")
bottom_months = monthly_absences.nsmallest(10, 'Absence_Days')
print_month_totals(bottom_months)

# Line chart of the monthly totals with the area under the line shaded.
# The line is plotted against the string labels while the shading uses
# integer positions 0..n-1.
fig, ax = plt.subplots(figsize=(16, 8))
month_totals = monthly_absences['Absence_Days']
ax.plot(monthly_absences['Year_Month_Str'], month_totals,
        marker='o', linewidth=2, markersize=6, color='steelblue')
ax.fill_between(range(len(monthly_absences)), month_totals, alpha=0.3, color='steelblue')
ax.set_title('Monthly Absence Days Trend', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Year-Month', fontsize=14, fontweight='bold')
ax.set_ylabel('Total Absence Days', fontsize=14, fontweight='bold')
plt.xticks(rotation=90, fontsize=10)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# TOP ABSENT TEACHERS ANALYSIS
# ============================================================================

print("="*80)
print("TOP 20 TEACHERS BY TOTAL ABSENCE DAYS (All Years)")
print("="*80)

# Rank employees by their all-years absence total and keep the 20 largest
all_teacher_days_sorted = all_teacher_days.sort_values(ascending=False)
top_20_teachers = all_teacher_days_sorted.head(20)

print("\nRank | Employee ID | Total Days")
print("-" * 40)
for rank, (emp_id, days) in enumerate(top_20_teachers.items(), 1):
    # `!s` converts the identifier to text before padding: the 's' format
    # code is only valid for strings, so a numeric identifier would
    # otherwise raise ValueError here.
    print(f"{rank:4d} | {emp_id!s:11s} | {days:10.2f}")

# Horizontal bar chart of the same 20 employees, largest total at the top
fig, ax = plt.subplots(figsize=(14, 8))
top_20_teachers.plot(kind='barh', color='steelblue', alpha=0.8, ax=ax)
ax.set_title('Top 20 Teachers by Total Absence Days (All Years)', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Total Absence Days', fontsize=14, fontweight='bold')
ax.set_ylabel('Employee ID', fontsize=14, fontweight='bold')
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)
# Write each bar's value just past its right-hand end
for i, v in enumerate(top_20_teachers.values):
    ax.text(v + 0.5, i, f'{v:.1f}', va='center', fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# DATA QUALITY CHECK
# ============================================================================

print("="*80)
print("DATA QUALITY ASSESSMENT")
print("="*80)

# 1. Overall span of absence dates
earliest_date = df['Date'].min()
latest_date = df['Date'].max()
print("\n1. Date Range Check:")
print(f"   Earliest Date: {earliest_date}")
print(f"   Latest Date: {latest_date}")
print(f"   Date Range: {(latest_date - earliest_date).days} days")

# 2. For each school-year label, the record count and the first/last absence
#    date carrying that label
print("\n2. School Year Consistency:")
for year in sorted(df['School Year'].unique()):
    year_data = df[df['School Year'] == year]
    date_range = f"{year_data['Date'].min().date()} to {year_data['Date'].max().date()}"
    print(f"   {year}: {len(year_data):,} records, Date Range: {date_range}")

# 3. Records whose computed absence length is zero or negative
print("\n3. Absence Days Validation:")
zero_days = len(df[df['Absence_Days'] == 0])
negative_days = len(df[df['Absence_Days'] < 0])
print(f"   Records with 0 absence days: {zero_days:,}")
print(f"   Records with negative days: {negative_days:,}")

# 4. Employee identifiers: distinct IDs, total records, and how many
#    (employee, school year) pairs have more than one record
print("\n4. Employee Identifier Check:")
duplicate_emps = df.groupby(['Employee Identifier', 'School Year']).size()
duplicate_emps = duplicate_emps[duplicate_emps > 1]
print(f"   Unique Employee IDs: {df['Employee Identifier'].nunique():,}")
print(f"   Total Records: {len(df):,}")
print(f"   Employee/School Year pairs with more than one record: {len(duplicate_emps):,}")

# 5. Distinct absence types. Values are converted with str() before joining
#    because str.join raises TypeError on any non-string item (e.g. NaN).
print("\n5. Absence Type Validation:")
absence_types = df['Absence Type'].unique()
print(f"   Unique Absence Types: {df['Absence Type'].nunique()}")
print(f"   Types: {', '.join(map(str, absence_types))}")

print("\n✓ Data quality check complete!")