In [1]:
# Import dependencies
import pandas as pd
import numpy as np

In [2]:
# Read schools file
df_schools = pd.read_csv('./datasource/schools_complete.csv')
df_schools.head() 

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [3]:
# Summarize Schools data.
dic = {'School ID': 'nunique', 'type':'sum' , 'budget':'sum'}
sum_schools = pd.DataFrame(df_schools.agg(dic)).T

# Rename columns
rename_cols = {"School ID":"Total Schools", "sum":"Total Students", "budget":"Total Budget"}
sum_schools.rename(columns=rename_cols, inplace=True)
sum_schools

Unnamed: 0,Total Schools,Total Budget
0,15,24649428


In [4]:
# Head students file
df_students = pd.read_csv('./datasource/students_complete.csv')

# Add columns for pass math and reading
df_students['pass_math'] = df_students['math_score'].apply(lambda score: 1 if score >=70 else 0)
df_students['pass_read'] = df_students['reading_score'].apply(lambda score: 1 if score >=70 else 0)

df_students.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,pass_math,pass_read
0,0,Paul Bradley,M,9th,Huang High School,66,79,1,0
1,1,Victor Smith,M,12th,Huang High School,94,61,0,1
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,1
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,0
4,4,Bonnie Ray,F,9th,Huang High School,97,84,1,1


In [5]:
# Summarize Students data.
dic = {"Student ID": "nunique", "math_score":"mean",  'reading_score':'mean',  'pass_math':'sum',  'pass_read':'sum'}
sum_students = pd.DataFrame(df_students.agg(dic)).T
# Rename columns
rename_cols = {"Student ID":"Total Students", "math_score":"Average Math Score", "reading_score":"Average Reading Score"}
sum_students.rename(columns=rename_cols, inplace=True) 
sum_students

Unnamed: 0,Total Students,Average Math Score,Average Reading Score,pass_math,pass_read
0,39170.0,78.985371,81.87784,29370.0,33610.0


In [6]:
# Compute Students Percentages
sum_students["% Passing Math"] = sum_students["pass_math"] / sum_students['Total Students'] * 100
sum_students["% Passing Reading"] =  sum_students["pass_read"] / sum_students['Total Students'] * 100
# Really??? Should be "% Passing Math" + "% Passing Reading", is not it?
sum_students["% Overall Passing Rate"] = (sum_students["Average Math Score"] + sum_students["Average Reading Score"]) / 2
sum_students

Unnamed: 0,Total Students,Average Math Score,Average Reading Score,pass_math,pass_read,% Passing Math,% Passing Reading,% Overall Passing Rate
0,39170.0,78.985371,81.87784,29370.0,33610.0,74.980853,85.805463,80.431606


In [7]:
# Merge Schools and Students summaries 
total_summary = pd.merge(sum_schools , sum_students, on="Total Students" how="outer")

# Format before print
total_summary["Total Budget"] = total_summary["Total Budget"].astype(float).map("${:,.2f}".format)
total_summary["Total Students"] = total_summary["Total Students"].astype(int).map("{:,.0f}".format)
total_summary["% Passing Math"] = total_summary["% Passing Math"].astype(float).map("{:.2f}%".format)
total_summary["% Passing Reading"] = total_summary["% Passing Reading"].astype(float).map("{:.2f}%".format)
total_summary["% Overall Passing Rate"] = total_summary["% Overall Passing Rate"].astype(float).map("{:.2f}%".format)
# Drop unnecessary columns
total_summary.drop(['pass_math','pass_read'], axis=1, inplace=True)

MergeError: No common columns to perform merge on

# District Summary

In [None]:
total_summary

# Starting School Summary

In [None]:
# Merging main tables
main_tbl = pd.merge(df_schools, df_students, how="left", on="school_name")
main_tbl.info()

In [None]:
# Add columns for pass math and reading
main_tbl.head()

In [None]:
# Create an overview table that summarizes key metrics about each school
aggregation = {"Student ID": "count", "math_score":"mean",  'reading_score':'mean',"pass_math":"sum","pass_read":"sum"}
renamecol =   {"Student ID": "Total Students", "math_score":"Average Math Score",  'reading_score':'Average Reading Score', \
               "school_name":"School Name","type":"School Type","budget":"Total School Budget"}
agg_by_school = main_tbl.groupby(["School ID","school_name","type","budget"],axis=0).agg(aggregation)
agg_by_school.reset_index(inplace=True)
agg_by_school.set_index("School ID",inplace=True)
agg_by_school.rename(columns=renamecol,inplace=True)
agg_by_school["Per Student Budget"] = agg_by_school['Total School Budget']/agg_by_school['Total Students']
agg_by_school["% Passing Math"] = agg_by_school['pass_math']/agg_by_school['Total Students'] * 100
agg_by_school["% Passing Reading"] = agg_by_school['pass_read']/agg_by_school['Total Students'] * 100
agg_by_school["% Overall Passing Rate"] = (agg_by_school['% Passing Math'] + agg_by_school['% Passing Reading']) / 2 

# Format output.
agg_by_school["Total Students"] = agg_by_school["Total Students"].astype(float).map("{:,.0f}".format)
agg_by_school["Total School Budget"] = agg_by_school["Total School Budget"].astype(float).map("${:,.2f}".format)
agg_by_school["Per Student Budget"] = agg_by_school["Per Student Budget"].astype(float).map("${:,.2f}".format)
agg_by_school["% Passing Math"] = agg_by_school["% Passing Math"].astype(float).map("{:.2f}%".format)
agg_by_school["% Passing Reading"] = agg_by_school["% Passing Reading"].astype(float).map("{:.2f}%".format)
agg_by_school["% Overall Passing Rate"] = agg_by_school["% Overall Passing Rate"].astype(float).map("{:.3f}%".format)

# Remove index from "School ID", it will be dropped.
agg_by_school.reset_index(inplace=True)
# Drop unnecessary columns.
agg_by_school.drop(['pass_math','pass_read', 'School ID'], axis = 1, inplace = True)
# Set the index to "School Name".
agg_by_school.set_index("School Name",inplace=True)
# agg_by_school.head()


# Top Performing Schools (By Passing Rate)

In [None]:
agg_by_school.sort_values("% Overall Passing Rate",ascending=False).head(5)

# Bottom Performing Schools (By Passing Rate)

In [None]:
agg_by_school.sort_values("% Overall Passing Rate",ascending=True).head(5)

# Math Scores by Grade

In [None]:
# Create DataFrame for Math Score average Grouping By School ID and Grade  
grade_math_score = main_tbl.pivot_table(index=["school_name"], columns='grade', values='math_score', aggfunc='mean') 

# Sort Grade and rename titles name for presentation.
columns = ['9th', '10th','11th','12th']
grade_math_score = grade_math_score[columns]
grade_math_score.columns.name = 'Grade'
grade_math_score.index.name = 'School Name'
grade_math_score

# Reading Scores by Grade

In [None]:
# Create DataFrame for Reading Score average Grouping By School ID and Grade  
grade_reading_score = main_tbl.pivot_table(index=["school_name"], columns='grade', values='reading_score', aggfunc='mean') 

# Sort Grade and rename titles name for presentation.
columns = ['9th', '10th','11th','12th']
grade_reading_score = grade_reading_score[columns]
grade_reading_score.columns.name = 'Grade'
grade_reading_score.index.name = 'School Name'

grade_reading_score

In [None]:
main_tbl.head()