In [1]:
import pandas as pd
import os
from functools import reduce

#setup file import
schools_file = os.path.join("raw_data", "schools_complete.csv")
students_file = os.path.join("raw_data", "students_complete.csv")


# read files into dataframes
schools_df = pd.read_csv(schools_file)
students_df = pd.read_csv(students_file)

#Change the 'name' column of schools_df to 'school'.
# This will make merging and accessing the column more intuitive later,
# and keeps the dataset consistent throughout the entire analysis.
schools_df = schools_df.rename(columns={"name":"school"})

In [2]:
schools_df

Unnamed: 0,School ID,school,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500
5,5,Wilson High School,Charter,2283,1319574
6,6,Cabrera High School,Charter,1858,1081356
7,7,Bailey High School,District,4976,3124928
8,8,Holden High School,Charter,427,248087
9,9,Pena High School,Charter,962,585858


In [3]:
students_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


# District Summary


In [4]:
'''
District Summary
Create high-level snapshot (in table form) of district's key metrics, including:
- total schools
- total students (formatted with thousands separators)
- total budget (formatted as currency)
- average math score 
- average reading score
- % passing math
- % passing reading
- overall passing rate (average of % passing math & % passing reading)
'''

#calculate total number of schools by counting school ids
total_schools = schools_df['School ID'].count()

#calculate total number of students by counting student ids
total_students = students_df['Student ID'].count()

#calculate total budget for each school
total_budget = schools_df['budget'].sum()

#calculate average math score for all students district-wide
avg_math_score = students_df['math_score'].mean()

#calculate average reading score for all students district-wide
avg_reading_score = students_df['reading_score'].mean()

#calculate number of students with passing math scores -- create data frame with passing scores,
# count number of students in dataframe, then calculate by dividing value by total_students
passing_math = students_df.loc[students_df["math_score"] >= 70, ["math_score"]]
pct_passing_math = (passing_math['math_score'].count() / total_students) * 100

#calculate number of students with passing read scores -- create data frame with passing scores,
# count number of students in dataframe, then calculate by dividing value by total_students
passing_reading = students_df.loc[students_df["reading_score"] >= 70, ["reading_score"]]
pct_passing_reading = (passing_reading['reading_score'].count() / total_students) * 100

# Average of % passing reading and % passing match
overall_passing_rate = ((pct_passing_math + pct_passing_reading) / 2)
overall_passing_rate

# create district_summary dataframe for display
district_summary = pd.DataFrame({
    "Total Schools": [total_schools],
    "Total Students": [total_students],
    "Total Budget": [total_budget],
    "Average Math Score": [avg_math_score],
    "Average Reading Score": [avg_reading_score],
    "% Passing Math": [pct_passing_math],
    "% Passing Reading": [pct_passing_reading],
    "Overall Passing Rate": [overall_passing_rate]})
district_summary = district_summary[["Total Schools", "Total Students", "Total Budget", "Average Math Score",
                                    "Average Reading Score", "% Passing Math", "% Passing Reading", 
                                    "Overall Passing Rate"]]
#work_df_us["avg pledge per project"] = work_df_us["avg pledge per project"].map("${:,.2f}".format)
district_summary["Total Students"] = district_summary["Total Students"].map("{:,}".format)
district_summary["Total Budget"] = district_summary["Total Budget"].map("${:,.2f}".format)
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,80.393158


# School Summary

In [5]:
school_name = schools_df['school']
school_name

0         Huang High School
1      Figueroa High School
2       Shelton High School
3     Hernandez High School
4       Griffin High School
5        Wilson High School
6       Cabrera High School
7        Bailey High School
8        Holden High School
9          Pena High School
10       Wright High School
11    Rodriguez High School
12      Johnson High School
13         Ford High School
14       Thomas High School
Name: school, dtype: object

In [6]:
school_type = schools_df['type']
school_type

0     District
1     District
2      Charter
3     District
4      Charter
5      Charter
6      Charter
7     District
8      Charter
9      Charter
10     Charter
11    District
12    District
13    District
14     Charter
Name: type, dtype: object

In [7]:
total_school_budget = schools_df['budget']
total_school_budget

0     1910635
1     1884411
2     1056600
3     3022020
4      917500
5     1319574
6     1081356
7     3124928
8      248087
9      585858
10    1049400
11    2547363
12    3094650
13    1763916
14    1043130
Name: budget, dtype: int64

In [8]:
total_students_per_school = schools_df['size']
total_students_per_school

0     2917
1     2949
2     1761
3     4635
4     1468
5     2283
6     1858
7     4976
8      427
9      962
10    1800
11    3999
12    4761
13    2739
14    1635
Name: size, dtype: int64

In [9]:
per_student_budget = schools_df['budget'] / schools_df['size']
per_student_budget

0     655.0
1     639.0
2     600.0
3     652.0
4     625.0
5     578.0
6     582.0
7     628.0
8     581.0
9     609.0
10    583.0
11    637.0
12    650.0
13    644.0
14    638.0
dtype: float64

In [10]:
# create dataframe containing only students who pass math
students_passing_math = students_df.loc[students_df['math_score'] > 70]
students_passing_math.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
4,4,Bonnie Ray,F,9th,Huang High School,97,84
5,5,Bryan Miranda,M,9th,Huang High School,94,94
6,6,Sheena Carter,F,11th,Huang High School,82,80
8,8,Michael Roth,M,10th,Huang High School,95,87


In [27]:
# reduce dataframe to include only high school name and a new field that will be used by the groupby for the 
# student counter. For now, it is initialized to zero.
students_passing_math_red = pd.DataFrame({"school": students_passing_math["school"],
                                         "nbr_students_passing_math": 0})
#place school field in leftmost position for easier visualization by hoo-mans
students_passing_math_red = students_passing_math_red[["school","nbr_students_passing_math"]]
students_passing_math_red.head()


Unnamed: 0,school,nbr_students_passing_math
0,Huang High School,0
4,Huang High School,0
5,Huang High School,0
6,Huang High School,0
8,Huang High School,0


In [29]:
# group students passing math by high school and count. The count of students in the students_passing_math_red 
# dataset will go in the nbr_students_passing_math column.
students_passing_math_by_school = students_passing_math_red.groupby(['school']).count().reset_index()
students_passing_math_by_school

Unnamed: 0,school,nbr_students_passing_math
0,Bailey High School,3216
1,Cabrera High School,1664
2,Figueroa High School,1880
3,Ford High School,1801
4,Griffin High School,1317
5,Hernandez High School,3001
6,Holden High School,387
7,Huang High School,1847
8,Johnson High School,3040
9,Pena High School,882


In [15]:
# create dataframe containing only students who pass reading
students_passing_reading = students_df.loc[students_df['reading_score'] > 70]
students_passing_reading.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
4,4,Bonnie Ray,F,9th,Huang High School,97,84
5,5,Bryan Miranda,M,9th,Huang High School,94,94
6,6,Sheena Carter,F,11th,Huang High School,82,80


In [31]:
# reduce dataframe to include only high school and a new, empty field to contain the student counter created
# by the groupby in the next step.
students_passing_reading_red = pd.DataFrame({"school": students_passing_reading["school"],
                                          "nbr_students_passing_reading": 0})
# move school to leftmost position to make it easier for hoo-mans to visualize
students_passing_reading_red = students_passing_reading_red[["school","nbr_students_passing_reading"]]
students_passing_reading_red.head()

Unnamed: 0,school,nbr_students_passing_reading
1,Huang High School,0
2,Huang High School,0
4,Huang High School,0
5,Huang High School,0
6,Huang High School,0


In [32]:
# group students passing reading by high school and count
students_passing_reading_by_school = students_passing_reading_red.groupby(['school']).count().reset_index()
students_passing_reading_by_school

Unnamed: 0,school,nbr_students_passing_reading
0,Bailey High School,3946
1,Cabrera High School,1744
2,Figueroa High School,2313
3,Ford High School,2123
4,Griffin High School,1371
5,Hernandez High School,3624
6,Holden High School,396
7,Huang High School,2299
8,Johnson High School,3727
9,Pena High School,887


In [33]:
#group students by school to calculate average math and reading scores per school
school_group = students_df.groupby(['school'])

In [37]:

avg_scores_df = pd.DataFrame({"avg_math_score": school_group['math_score'].mean(),
                                 "avg_reading_score": school_group['reading_score'].mean()}).reset_index()
avg_scores_df



Unnamed: 0,school,avg_math_score,avg_reading_score
0,Bailey High School,77.048432,81.033963
1,Cabrera High School,83.061895,83.97578
2,Figueroa High School,76.711767,81.15802
3,Ford High School,77.102592,80.746258
4,Griffin High School,83.351499,83.816757
5,Hernandez High School,77.289752,80.934412
6,Holden High School,83.803279,83.814988
7,Huang High School,76.629414,81.182722
8,Johnson High School,77.072464,80.966394
9,Pena High School,83.839917,84.044699


In [38]:
# create 'super table' of schools_df, students passing reading, students passing math and average math scores.
# we'll pull columns from this table to build the summary.
# normally, merge works on two dataframes, but we'll use a little dot-notation magic
super_school_df = pd.merge(schools_df, avg_scores_df, on = 'school') \
.merge(students_passing_math_by_school, on = 'school') \
.merge(students_passing_reading_by_school, on = 'school') 
                                                                          
super_school_df

Unnamed: 0,School ID,school,type,size,budget,avg_math_score,avg_reading_score,nbr_students_passing_math,nbr_students_passing_reading
0,0,Huang High School,District,2917,1910635,76.629414,81.182722,1847,2299
1,1,Figueroa High School,District,2949,1884411,76.711767,81.15802,1880,2313
2,2,Shelton High School,Charter,1761,1056600,83.359455,83.725724,1583,1631
3,3,Hernandez High School,District,4635,3022020,77.289752,80.934412,3001,3624
4,4,Griffin High School,Charter,1468,917500,83.351499,83.816757,1317,1371
5,5,Wilson High School,Charter,2283,1319574,83.274201,83.989488,2076,2129
6,6,Cabrera High School,Charter,1858,1081356,83.061895,83.97578,1664,1744
7,7,Bailey High School,District,4976,3124928,77.048432,81.033963,3216,3946
8,8,Holden High School,Charter,427,248087,83.803279,83.814988,387,396
9,9,Pena High School,Charter,962,585858,83.839917,84.044699,882,887


In [48]:
#calculate percentage of students passing math
pct_students_passing_math_school = (super_school_df['nbr_students_passing_math'] / super_school_df['size']) * 100
pct_students_passing_reading_school = (super_school_df['nbr_students_passing_reading'] / \
                                      super_school_df['size']) * 100
overall_passing_rate_school = ((pct_students_passing_math_school + pct_students_passing_reading_school) / 2)
overall_passing_rate_school

0     71.066164
1     71.091896
2     91.254969
3     71.467098
4     91.553134
5     92.093736
6     91.711518
7     71.965434
8     91.686183
9     91.943867
10    91.861111
11    70.905226
12    71.067003
13    71.631982
14    91.559633
dtype: float64

In [54]:
# create district_summary dataframe for display
school_summary = pd.DataFrame({'School Name': super_school_df['school'],
                               'School Type': super_school_df['type'],
                               'Total Students': super_school_df['size'],
                               'Total School Budget': super_school_df['budget'],
                               'Per Student Budget': (super_school_df['budget'] / super_school_df['size']),
                               'Average Math Score': super_school_df['avg_math_score'],
                               'Average Reading Score': super_school_df['avg_reading_score'],
                               '% Passing Math': pct_students_passing_math_school,
                               '% Passing Reading': pct_students_passing_reading_school,
                               'Overall Passing Rate': overall_passing_rate_school})

school_summary['Total School Budget'] = school_summary['Total School Budget'].map('${:,.2f}'.format)
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].map('${:,.2f}'.format)

school_summary = school_summary[['School Name', 'School Type', 'Total Students', 'Total School Budget',
                                 'Per Student Budget', 'Average Math Score', 'Average Reading Score',
                                 '% Passing Math', '% Passing Reading', 'Overall Passing Rate']] \
                 .set_index('School Name').rename_axis(None)
               
school_summary

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,63.318478,78.81385,71.066164
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,63.750424,78.433367,71.091896
Shelton High School,Charter,1761,"$1,056,600.00",$600.00,83.359455,83.725724,89.892107,92.617831,91.254969
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,64.746494,78.187702,71.467098
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,89.713896,93.392371,91.553134
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.274201,83.989488,90.932983,93.25449,92.093736
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,89.558665,93.86437,91.711518
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,64.630225,79.300643,71.965434
Holden High School,Charter,427,"$248,087.00",$581.00,83.803279,83.814988,90.632319,92.740047,91.686183
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,91.683992,92.203742,91.943867
