In [1]:
# Import dependencies
import pandas as pd
import numpy as np


In [2]:
# Read schools file
df_schools = pd.read_csv('./datasource/schools_complete.csv')
df_schools.head() 

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [3]:
# Summarize Schools data.
dic = {'School ID': 'nunique', 'budget':'sum'}
sum_schools = pd.DataFrame(df_schools.agg(dic)).T
sum_schools.rename(columns={"School ID":"Total Schools", "budget":"Total Budget"}, inplace=True)
sum_schools["ID"] = 1
sum_schools


Unnamed: 0,Total Schools,Total Budget,ID
0,15,24649428,1


In [4]:
# Head students file
df_students = pd.read_csv('./datasource/students_complete.csv')

# Add columns for pass math and reading
df_students['pass_math'] = df_students['math_score'].apply(lambda score: 1 if score >=70 else 0)
df_students['pass_read'] = df_students['reading_score'].apply(lambda score: 1 if score >=70 else 0)

df_students.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,pass_math,pass_read
0,0,Paul Bradley,M,9th,Huang High School,66,79,1,0
1,1,Victor Smith,M,12th,Huang High School,94,61,0,1
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,1
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,0
4,4,Bonnie Ray,F,9th,Huang High School,97,84,1,1


In [5]:
# Summarize Students data.
dic = {"Student ID": "nunique", "math_score":"mean",  'reading_score':'mean',  'pass_math':'sum',  'pass_read':'sum'}
sum_students = pd.DataFrame(df_students.agg(dic)).T
sum_students.rename(columns={"Student ID":"Total Students", "math_score":"Average Math Score",
                             "reading_score":"Average Reading Score"}, inplace=True) 
sum_students


Unnamed: 0,Total Students,Average Math Score,Average Reading Score,pass_math,pass_read
0,39170.0,78.985371,81.87784,29370.0,33610.0


In [6]:
# Compute Students Percentages
sum_students["% Passing Math"] = sum_students["pass_math"] / sum_students['Total Students'] * 100
sum_students["% Passing Reading"] =  sum_students["pass_read"] / sum_students['Total Students'] * 100
# Really??? Should be "% Passing Math" + "% Passing Reading", is not it?
sum_students["% Overall Passing Rate"] = (sum_students["Average Math Score"] + sum_students["Average Reading Score"]) / 2
sum_students["ID"] = 1
sum_students

Unnamed: 0,Total Students,Average Math Score,Average Reading Score,pass_math,pass_read,% Passing Math,% Passing Reading,% Overall Passing Rate,ID
0,39170.0,78.985371,81.87784,29370.0,33610.0,74.980853,85.805463,80.431606,1


In [7]:
# Merge Schools and Students summaries 
total_summary = pd.merge(sum_schools , sum_students, on="ID", how="inner")
# Format before print
total_summary["Total Budget"] = total_summary["Total Budget"].astype(float).map("${:,.2f}".format)
total_summary["Total Students"] = total_summary["Total Students"].astype(int).map("{:,.0f}".format)
total_summary["% Passing Math"] = total_summary["% Passing Math"].astype(float).map("{:.2f}%".format)
total_summary["% Passing Reading"] = total_summary["% Passing Reading"].astype(float).map("{:.2f}%".format)
total_summary["% Overall Passing Rate"] = total_summary["% Overall Passing Rate"].astype(float).map("{:.2f}%".format)
total_summary.drop(['ID','pass_math','pass_read'], axis=1, inplace=True)

# District Summary

In [8]:
total_summary

Unnamed: 0,Total Schools,Total Budget,Total Students,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,"$24,649,428.00",39170,78.985371,81.87784,74.98%,85.81%,80.43%


# Starting School Summary

In [9]:
# Merging main tables
main_tbl = pd.merge(df_schools, df_students, how="left", on="school_name")
main_tbl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39170 entries, 0 to 39169
Data columns (total 13 columns):
School ID        39170 non-null int64
school_name      39170 non-null object
type             39170 non-null object
size             39170 non-null int64
budget           39170 non-null int64
Student ID       39170 non-null int64
student_name     39170 non-null object
gender           39170 non-null object
grade            39170 non-null object
reading_score    39170 non-null int64
math_score       39170 non-null int64
pass_math        39170 non-null int64
pass_read        39170 non-null int64
dtypes: int64(8), object(5)
memory usage: 4.2+ MB


In [10]:
# Add columns for pass math and reading
main_tbl.head()

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score,pass_math,pass_read
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79,1,0
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61,0,1
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60,0,1
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58,0,0
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84,1,1


In [20]:


aggregation = {"Student ID": "count", "math_score":"mean",  'reading_score':'mean',"pass_math":"sum","pass_read":"sum"}
renamecol =   {"Student ID": "Total Students", "math_score":"Average Math Score",  'reading_score':'Average Reading Score',
               "school_name":"School Name","type":"School Type","budget":"Total School Budget"}
agg_by_school = main_tbl.groupby(["School ID","school_name","type","budget"],axis=0).agg(aggregation)
agg_by_school.reset_index(inplace=True)
agg_by_school.set_index("School ID",inplace=True)
agg_by_school.rename(columns=renamecol,inplace=True)
agg_by_school["Per Student Budget"] = agg_by_school['Total School Budget']/agg_by_school['Total Students']
agg_by_school["% Passing Math"] = agg_by_school['pass_math']/agg_by_school['Total Students'] * 100
agg_by_school["% Passing Reading"] = agg_by_school['pass_read']/agg_by_school['Total Students'] * 100
agg_by_school["% Overall Passing Rate"] = (agg_by_school['% Passing Math'] + agg_by_school['% Passing Reading']) / 2 

# Remove index from "School ID", it will be dropped.
agg_by_school.reset_index(inplace=True)
# Drop unnecessary columns.
agg_by_school.drop(['pass_math','pass_read', 'School ID'], axis = 1, inplace = True)
# Set the index to "School Name".
agg_by_school.set_index("School Name",inplace=True)
agg_by_school.head()


Unnamed: 0_level_0,School Type,Total School Budget,Total Students,Average Math Score,Average Reading Score,Per Student Budget,% Passing Math,% Passing Reading,% Overall Passing Rate
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Huang High School,District,1910635,2917,76.629414,81.182722,655.0,65.683922,81.316421,73.500171
Figueroa High School,District,1884411,2949,76.711767,81.15802,639.0,65.988471,80.739234,73.363852
Shelton High School,Charter,1056600,1761,83.359455,83.725724,600.0,93.867121,95.854628,94.860875
Hernandez High School,District,3022020,4635,77.289752,80.934412,652.0,66.752967,80.862999,73.807983
Griffin High School,Charter,917500,1468,83.351499,83.816757,625.0,93.392371,97.138965,95.265668


# Top Performing Schools (By Passing Rate)

In [17]:
agg_by_school.sort_values("% Overall Passing Rate",ascending=False).head(5).set_index(drop[]'')

Unnamed: 0,School ID,School Name,School Type,Total School Budget,Total Students,Average Math Score,Average Reading Score,pass_math,pass_read,Per Student Budget,% Passing Math,% Passing Reading,% Overall Passing Rate
0,6,Cabrera High School,Charter,1081356,1858,83.061895,83.97578,1749,1803,582.0,94.133477,97.039828,95.586652
1,14,Thomas High School,Charter,1043130,1635,83.418349,83.84893,1525,1591,638.0,93.272171,97.308869,95.29052
2,9,Pena High School,Charter,585858,962,83.839917,84.044699,910,923,609.0,94.594595,95.945946,95.27027
3,4,Griffin High School,Charter,917500,1468,83.351499,83.816757,1371,1426,625.0,93.392371,97.138965,95.265668
4,5,Wilson High School,Charter,1319574,2283,83.274201,83.989488,2143,2204,578.0,93.867718,96.539641,95.203679
