In [7]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

def nan_report(df):
    print(df.isnull().sum() / len(df))

In [8]:
# Copy and pasted information from https://wveis.k12.wv.us/ses/StatSum/Trans.cfm

transportation_columns = ['County', 'Regular Miles', 'Vocational Miles', 
                          'Extra-Curricular Miles', 'Curricular Miles', 
                          'Total Miles', 'Students Transported']

transportation_df = pd.read_excel('../data/raw/SY13-14_Miles_Traveled_By_Regular_Buses.xlsx',
                                 names = transportation_columns)

transportation_df['County'] = transportation_df.County.str.title()
transportation_df.set_index('County', inplace = True)
print(transportation_df.info())
nan_report(transportation_df)
#print(transportation_df.isnull().sum() / len(transportation_df))
print(transportation_df.head())
transportation_tidy_df = transportation_df.reset_index().melt(id_vars = ['County'])

transportation_df.to_pickle('../data/transportation.pkl')
transportation_tidy_df.to_pickle('../data/transportation_tidy.pkl')
transportation_tidy_df.to_csv('../data/transportation_tidy.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 56 entries, Barbour to School For Deaf And Blind
Data columns (total 6 columns):
Regular Miles             56 non-null int64
Vocational Miles          56 non-null int64
Extra-Curricular Miles    56 non-null int64
Curricular Miles          56 non-null int64
Total Miles               56 non-null int64
Students Transported      56 non-null int64
dtypes: int64(6)
memory usage: 3.1+ KB
None
Regular Miles             0.0
Vocational Miles          0.0
Extra-Curricular Miles    0.0
Curricular Miles          0.0
Total Miles               0.0
Students Transported      0.0
dtype: float64
          Regular Miles  Vocational Miles  Extra-Curricular Miles  \
County                                                              
Barbour          347227             10246                   14354   
Berkeley        2394406             42507                   78164   
Boone            697774             32587                   46303   
Braxton          284505   

In [13]:
# Had to copy and paste tables from https://wveis.k12.wv.us/ses/StatSum/enroll10_p_r.cfm
# in to an Excel spreadsheet

prom_retention_df = pd.read_excel('../data/raw/SY13-14_Promotion_Retention_By_Grade.xlsx', header = [0, 1])

prom_retention_df.index = prom_retention_df.index.str.title()
#prom_retention_df.info()
nan_report(prom_retention_df)


prom_retention_df.to_pickle('../data/prom_retention.pkl')
prom_retention_df.head()

                County   
Kindergarten    Promotion    0.0
                Retention    0.0
                Total        0.0
First Grade     Promotion    0.0
                Retention    0.0
                Total        0.0
Second Grade    Promotion    0.0
                Retention    0.0
                Total        0.0
Third Grade     Promotion    0.0
                Retention    0.0
                Total        0.0
Fourth Grade    Promotion    0.0
                Retention    0.0
                Total        0.0
Fifth Grade     Promotion    0.0
                Retention    0.0
                Total        0.0
Sixth Grade     Promotion    0.0
                Retention    0.0
                Total        0.0
Seventh Grade   Promotion    0.0
                Retention    0.0
                Total        0.0
Eighth Grade    Promotion    0.0
                Retention    0.0
                Total        0.0
Ninth Grade     Promotion    0.0
                Retention    0.0
                T

Unnamed: 0_level_0,Kindergarten,Kindergarten,Kindergarten,First Grade,First Grade,First Grade,Second Grade,Second Grade,Second Grade,Third Grade,...,Ninth Grade,Tenth Grade,Tenth Grade,Tenth Grade,Eleventh Grade,Eleventh Grade,Eleventh Grade,Twelfth Grade,Twelfth Grade,Twelfth Grade
County,Promotion,Retention,Total,Promotion,Retention,Total,Promotion,Retention,Total,Promotion,...,Total,Promotion,Retention,Total,Promotion,Retention,Total,Promotion,Retention,Total
Barbour,189,2,191,194,0,194,183,2,185,181,...,185,174,0,174,154,0,154,161,12,173
Berkeley,1347,52,1399,1430,37,1467,1377,10,1387,1360,...,1426,1280,0,1280,1022,1,1023,1039,49,1088
Boone,309,30,339,339,8,347,299,2,301,326,...,341,302,11,313,279,0,279,259,0,259
Braxton,168,10,178,157,3,160,155,4,159,150,...,173,156,3,159,126,4,130,121,9,130
Brooke,226,15,241,224,2,226,224,2,226,236,...,270,241,8,249,220,9,229,220,0,220


In [10]:
# Convert DataFrame to tidy version
prom_retention_tidy_df = prom_retention_df.stack().reset_index()\
                            .rename(columns={'level_0': 'County', 'County': 'Type'})\
                            .melt(id_vars = ['County', 'Type'], 
                                  var_name = 'Grade Level', value_name = 'Students')
        
# Remove total field (can be recreated as needed from the promotion and retention values)
prom_retention_tidy_df = prom_retention_tidy_df[prom_retention_tidy_df.Type != 'Total']

# Organize the grade levels as factor levels and order them by grade level
prom_retention_tidy_df['Grade Level'] = prom_retention_tidy_df['Grade Level'].astype('category')

grade_level_categories = ['Kindergarten', 'First Grade', 'Second Grade', 'Third Grade',
                         'Fourth Grade', 'Fifth Grade', 'Sixth Grade', 'Seventh Grade',
                         'Eighth Grade', 'Ninth Grade', 'Tenth Grade', 'Eleventh Grade',
                         'Twelfth Grade']

prom_retention_tidy_df['Grade Level'].cat.reorder_categories(grade_level_categories, inplace = True)

# Sort by County and Grade Level
prom_retention_tidy_df = prom_retention_tidy_df.sort_values(by = ['County', 'Grade Level']).reset_index(drop = True)

# Write tidy DataFrame to file
prom_retention_tidy_df.to_pickle('../data/prom_retention_tidy.pkl')
prom_retention_tidy_df.to_csv('../data/prom_retention_tidy.csv')

In [14]:
prom_retention_tidy_df.pivot_table(index = ['County', 'Type'], columns = ['Grade Level'], values = 'Students').head(10)

Unnamed: 0_level_0,Grade Level,Kindergarten,First Grade,Second Grade,Third Grade,Fourth Grade,Fifth Grade,Sixth Grade,Seventh Grade,Eighth Grade,Ninth Grade,Tenth Grade,Eleventh Grade,Twelfth Grade
County,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Barbour,Promotion,189,194,183,181,174,157,187,167,166,185,174,154,161
Barbour,Retention,2,0,2,0,0,0,1,0,1,0,0,0,12
Berkeley,Promotion,1347,1430,1377,1360,1377,1341,1399,1377,1343,1426,1280,1022,1039
Berkeley,Retention,52,37,10,3,15,2,8,17,15,0,0,1,49
Boone,Promotion,309,339,299,326,331,349,314,353,332,325,302,279,259
Boone,Retention,30,8,2,2,3,3,0,0,0,16,11,0,0
Braxton,Promotion,168,157,155,150,149,151,171,132,152,168,156,126,121
Braxton,Retention,10,3,4,2,2,1,0,0,0,5,3,4,9
Brooke,Promotion,226,224,224,236,227,236,216,230,257,251,241,220,220
Brooke,Retention,15,2,2,1,0,0,0,0,0,19,8,9,0


In [30]:
qual_teacher_df = pd.read_csv('../data/raw/SY15-16_Percent of Classes Taught by Highly Qualified Teachers by District.csv', 
                              header = 0, skiprows = [1], usecols = [1, 2, 3], thousands = ',')
qual_teacher_df.info()
nan_report(qual_teacher_df)
qual_teacher_df['County'] = qual_teacher_df.County.str.title().str.strip()
qual_teacher_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 3 columns):
County                55 non-null object
HQT Taught Classes    55 non-null int64
Total Courses         55 non-null int64
dtypes: int64(2), object(1)
memory usage: 1.4+ KB
County                0.0
HQT Taught Classes    0.0
Total Courses         0.0
dtype: float64


Unnamed: 0,County,HQT Taught Classes,Total Courses
0,Barbour,607,664
1,Berkeley,3476,4022
2,Boone,872,975
3,Braxton,439,515
4,Brooke,713,825


In [34]:
qual_teacher_tidy_df = qual_teacher_df.melt(id_vars= 'County')

qual_teacher_df.to_pickle('../data/qual_teacher.pkl')
qual_teacher_tidy_df.to_pickle('../data/qual_teacher_tidy.pkl')
qual_teacher_tidy_df.to_csv('../data/qual_teacher_tidy.csv')




In [None]:
df = pd.read_excel('../data/SY17-18_SchoolComposition_PublicRelease_v2.xlsx')
df.head()

In [None]:
#df2 = df.groupby('District').sum().reset_index()
df2 = df.groupby('District').sum()
#df2[df2.District != 'State'].plot(kind = 'bar')
df2.head()
#df2.plot(kind = 'bar', x = df2.District, y = df2.Total)
#df2.plot.bar(x = df2.District, y = df2.Total)
df2.loc[df2.index != 'State','Total'].plot.bar()

In [None]:
df.Male.unique()

In [51]:
school_enrollment = pd.read_excel('../data/SY16-17_SchoolComposition_HistoricalReport_2017.xlsx', 
                                  na_values = '<10').fillna(0)
school_enrollment.rename(columns = {'District': 'County'}, inplace = True)

#print(school_enrollment.info())
school_enrollment.head()

Unnamed: 0,School Code,County,School,PK,0K,01,02,03,04,05,...,Hispanic,Asian,Native,Multi-Racial,Pacific Islander,Male,Female,Special Ed.,Low SES,ELL
0,2101,Barbour,Kasson Elementary/Middle School,16.0,25.0,20.0,21.0,21.0,19.0,16.0,...,0.0,0.0,0.0,0.0,0.0,105,84.0,21.0,189.0,0.0
1,2201,Barbour,Belington Elementary,36.0,56.0,60.0,55.0,73.0,59.0,0.0,...,0.0,0.0,0.0,0.0,0.0,164,175.0,55.0,339.0,0.0
2,2202,Barbour,Junior Elementary,18.0,10.0,12.0,9.0,11.0,18.0,0.0,...,0.0,0.0,0.0,0.0,0.0,37,41.0,13.0,78.0,0.0
3,2203,Barbour,Mount Vernon Elementary,0.0,8.0,9.0,11.0,10.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,21,28.0,10.0,49.0,0.0
4,2204,Barbour,Philippi Elementary School,65.0,61.0,53.0,63.0,64.0,70.0,0.0,...,0.0,0.0,0.0,12.0,0.0,196,180.0,72.0,376.0,0.0


In [54]:
county_information = pd.read_excel('../data/SY16-17_Enrollment by County & Grade.xls', 
                                   skiprows = 6, 
                                   dtype = {'County Number': object},
                                   usecols = [0, 1])
#print(county_information.index.values.title())
print(county_information.info())
county_information.rename(columns = {'County Name': 'County', 'County Number': 'County Code'}, inplace = True)
county_information['County'] = county_information['County'].str.title()
county_information = county_information.set_index('County')
county_information.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 2 columns):
County Number    55 non-null object
County Name      55 non-null object
dtypes: object(2)
memory usage: 960.0+ bytes
None


Unnamed: 0_level_0,County Code
County,Unnamed: 1_level_1
Barbour,2
Berkeley,4
Boone,6
Braxton,8
Brooke,10


In [55]:
school_enrollment.groupby('County').sum().head()

Unnamed: 0_level_0,PK,0K,01,02,03,04,05,06,07,08,...,Black,Hispanic,Asian,Native,Multi-Racial,Pacific Islander,Female,Special Ed.,Low SES,ELL
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Barbour,135.0,167.0,164.0,165.0,184.0,185.0,172.0,179.0,173.0,151.0,...,0.0,0.0,0.0,12.0,47.0,0.0,1117.0,430.0,2324.0,0.0
Berkeley,879.0,1359.0,1372.0,1379.0,1443.0,1443.0,1421.0,1407.0,1398.0,1418.0,...,1663.0,1250.0,68.0,0.0,1299.0,0.0,9327.0,3188.0,10815.0,425.0
Boone,193.0,289.0,285.0,317.0,271.0,318.0,287.0,312.0,316.0,354.0,...,10.0,0.0,0.0,0.0,0.0,0.0,1961.0,713.0,4178.0,0.0
Braxton,126.0,149.0,141.0,163.0,154.0,146.0,149.0,144.0,144.0,152.0,...,0.0,0.0,0.0,0.0,0.0,0.0,977.0,350.0,2039.0,0.0


In [126]:
assessment_information = pd.read_excel('../data/SY16-17_AssessmentProficiencySummary_AllGroups.xlsx', 
                                       header = 1, 
                                       dtype = {'District': object}).fillna(0)
columns_dict = {'District.1': 'County Name', 'District': 'County Code'}
assessment_information.rename(columns = columns_dict, inplace = True)
assessment_information = assessment_information.set_index(['County Code', 'County Name'])
assessment_information.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 715 entries, (002, Barbour) to (098, Wyoming)
Data columns (total 19 columns):
Group         715 non-null object
Population    715 non-null object
03            715 non-null float64
04            715 non-null float64
05            715 non-null float64
06            715 non-null float64
07            715 non-null float64
08            715 non-null float64
11            715 non-null float64
03.1          715 non-null float64
04.1          715 non-null float64
05.1          715 non-null float64
06.1          715 non-null float64
07.1          715 non-null float64
08.1          715 non-null float64
11.1          715 non-null float64
05.2          715 non-null float64
08.2          715 non-null float64
10            715 non-null float64
dtypes: float64(17), object(2)
memory usage: 108.5+ KB


In [127]:
def rename_subject_columns(df, subject_columns, prepend):
    rename_dict = {(column_name) : (prepend + column_name.split('.')[0] \
                                if column_name in subject_columns \
                                else column_name) \
               for column_name in df.columns}
    df.rename(columns = rename_dict, inplace = True)


math_columns = ['03', '04', '05', '06', '07', '08', '11']
#math_rename = {(column_name) : ('Math_' + column_name.split('.')[0] \
#                                if column_name in math_columns \
#                                else column_name) \
#               for column_name in assessment_information.columns}
#assessment_information.rename(columns = math_rename, inplace = True)
reading_columns = ['03.1', '04.1', '05.1', '06.1', '07.1', '08.1', '11.1']
#reading_rename = {(column_name) : ('Reading_' + column_name.split('.')[0] \
#                                if column_name in reading_columns \
#                                else column_name) \
#               for column_name in assessment_information.columns}
#assessment_information.rename(columns = reading_rename, inplace = True)
science_columns = ['05.2', '08.2', '10']
#science_rename = {(column_name) : ('Science_' + column_name.split('.')[0] \
#                                if column_name in science_columns \
#                                else column_name) \
#               for column_name in assessment_information.columns}
#assessment_information.rename(columns = science_rename, inplace = True)

rename_subject_columns(assessment_information, math_columns, 'Math_')
rename_subject_columns(assessment_information, reading_columns, 'Reading_')
rename_subject_columns(assessment_information, science_columns, 'Science_')

assessment_information.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 715 entries, (002, Barbour) to (098, Wyoming)
Data columns (total 19 columns):
Group         715 non-null object
Population    715 non-null object
Math_03       715 non-null float64
Math_04       715 non-null float64
Math_05       715 non-null float64
Math_06       715 non-null float64
Math_07       715 non-null float64
Math_08       715 non-null float64
Math_11       715 non-null float64
Reading_03    715 non-null float64
Reading_04    715 non-null float64
Reading_05    715 non-null float64
Reading_06    715 non-null float64
Reading_07    715 non-null float64
Reading_08    715 non-null float64
Reading_11    715 non-null float64
Science_05    715 non-null float64
Science_08    715 non-null float64
Science_10    715 non-null float64
dtypes: float64(17), object(2)
memory usage: 108.5+ KB


In [81]:
assessment_information.columns

Index(['District', 'County Name', 'Group', 'Population', '03', '04', '05',
       '06', '07', '08', '11', '03.1', '04.1', '05.1', '06.1', '07.1', '08.1',
       '11.1', '05.2', '08.2', '10'],
      dtype='object')

In [99]:
'Reading_03.1'.split('.')[0]

'Reading_03'

In [142]:
assessment_information_melted = pd.melt(assessment_information.reset_index(), id_vars = ['County Code', 'County Name', 'Group', 'Population'], 
                                        var_name = "Subject_Grade", 
                                        value_name = "% Proficient")
assessment_information_melted.head(25)
#assessment_information_melted.groupby(['Population']).mean()

Unnamed: 0,County Code,County Name,Group,Population,Subject_Grade,% Proficient
0,2,Barbour,Race/Ethnicity,Asian,Math_03,0.0
1,2,Barbour,Race/Ethnicity,Black or African American,Math_03,0.0
2,2,Barbour,Race/Ethnicity,Hispanic or Latino,Math_03,0.0
3,2,Barbour,Race/Ethnicity,Multi-racial,Math_03,0.333333
4,2,Barbour,Race/Ethnicity,Native American,Math_03,0.5
5,2,Barbour,Race/Ethnicity,Pacific Islander,Math_03,0.0
6,2,Barbour,Race/Ethnicity,White,Math_03,0.287356
7,2,Barbour,Gender,Female,Math_03,0.244186
8,2,Barbour,Gender,Male,Math_03,0.323232
9,2,Barbour,Student Status,English Language Learner,Math_03,0.0
