In [1]:
import pandas as pd
import numpy as np

In [8]:
cols = ['ncessch_num', 'grade', 'race', 'sex', 'enrollment']

def clean_df(year):
    df = pd.read_csv(f'Enrollment Data/schools_ccd_enrollment_{year}.csv', usecols=cols)
    
    keep_grades = [i for i in range(-1, 15)]
    df = df[df['grade'].isin(keep_grades)]
    
    keep_races = [i for i in range(1, 8)]
    df = df[df['race'].isin(keep_races)]
    
    races = {1:'White', 2:'Black', 3:'Hispanic', 4:'Asian', 5:'American Indian or Alaska Native', 
         6:'Native Hawaiian or other Pacific Islander', 7:'Two or more races'}
    df = df.replace({"race": races})
    
    df = df[df['enrollment'] > 0]
    enrollment_df = df.groupby('race')['enrollment'].sum().reset_index()
    enrollment_df['year'] = f'{year}-01-01'
    
    enrollment_df.to_csv(f'Cleaned Data/{year}_enrollment.csv', index=False)
    
    print(f'{year} done')
    

start_year = 1998
end_year = 2021 + 1
for i in range(start_year, end_year):
    clean_df(i)

1998 done
1999 done
2000 done
2001 done
2002 done
2003 done
2004 done
2005 done
2006 done
2007 done
2008 done
2009 done
2010 done
2011 done
2012 done
2013 done
2014 done
2015 done
2016 done
2017 done
2018 done
2019 done
2020 done
2021 done


---
### Original EDA Below

In [2]:
# columns: year, ncessch, ncessch_num, leaid, fips, grade, race, sex, enrollment
cols = ['ncessch_num', 'grade', 'race', 'sex', 'enrollment']

# 2010-2021
df = pd.read_csv('Enrollment Data/schools_ccd_enrollment_2021.csv', usecols=cols)

df.head()

Unnamed: 0,ncessch_num,grade,race,sex,enrollment
0,10000500870,7,5,2,1.0
1,10000500870,7,5,1,3.0
2,10000500870,7,4,2,2.0
3,10000500870,7,4,1,2.0
4,10000500870,7,2,2,7.0


In [3]:
sorted(df['grade'].unique())

[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 99, 999]

-1—Pre-K
0—Kindergarten
1—1
2—2
3—3
4—4
5—5
6—6
7—7
8—8
9—9
10—10
11—11
12—12
13—13
14—Adult education
15—Ungraded
99—Total
999—Not specified

In [4]:
# only keep -1 to 14
keep_grades = [i for i in range(-1, 15)]

df = df[df['grade'].isin(keep_grades)]

sorted(df['grade'].unique())

[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

1—White
2—Black
3—Hispanic
4—Asian
5—American Indian or Alaska Native
6—Native Hawaiian or other Pacific Islander
7—Two or more races
8—Nonresident alien
9—Unknown
20—Other
99—Total
-1—Missing/not reported
-2—Not applicable
-3—Suppressed data

In [5]:
sorted(df['race'].unique())

[1, 2, 3, 4, 5, 6, 7, 9, 99]

In [6]:
# only keep 1 to 7
keep_races = [i for i in range(1, 8)]

df = df[df['race'].isin(keep_races)]

sorted(df['race'].unique())

[1, 2, 3, 4, 5, 6, 7]

In [10]:
races = {1:'White', 2:'Black', 3:'Hispanic', 4:'Asian', 5:'American Indian or Alaska Native', 
         6:'Native Hawaiian or other Pacific Islander', 7:'Two or more races'}

df = df.replace({"race": races})
df

Unnamed: 0,ncessch_num,grade,race,sex,enrollment
0,10000500870,7,American Indian or Alaska Native,2,1.0
1,10000500870,7,American Indian or Alaska Native,1,3.0
2,10000500870,7,Asian,2,2.0
3,10000500870,7,Asian,1,2.0
4,10000500870,7,Black,2,7.0
...,...,...,...,...,...
18486019,780003000034,8,Hispanic,99,15.0
18486020,780003000034,8,Asian,99,0.0
18486021,780003000034,8,American Indian or Alaska Native,99,0.0
18486022,780003000034,8,Native Hawaiian or other Pacific Islander,99,0.0


In [11]:
# enrollment_df = df.groupby(['grade', 'race'])['enrollment'].sum().reset_index()

enrollment_df = df.groupby('race')['enrollment'].sum().reset_index()
enrollment_df['year'] = '2021-01-01'
enrollment_df

Unnamed: 0,race,enrollment,year
0,American Indian or Alaska Native,963584.0,2021-01-01
1,Asian,5299714.0,2021-01-01
2,Black,14635934.0,2021-01-01
3,Hispanic,28330290.0,2021-01-01
4,Native Hawaiian or other Pacific Islander,412186.0,2021-01-01
5,Two or more races,4633530.0,2021-01-01
6,White,44360226.0,2021-01-01


In [12]:
enrollment_df.to_csv('Cleaned Data/2021_enrollment.csv', index=False)