In [1]:
#Dependencies and setup
import pandas as pd

#Reference the desired CSV file
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

#Store Homework data files into Pandas Data Frames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

#Combine the 2 data files into a single dataset
school_data_df = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])
school_data_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [2]:
#1. District Summary

#Calculate the total number of schools

#Check if there's any missing data
school_data_df.count()

#Number of unique schools
school_count = len(school_data_df["school_name"].unique())
#school_count

#Calculate de Number of Students
students_count = len(school_data_df["Student ID"].unique())
students_count

#Calculate Total Budget
school_budget = school_data_df["budget"].unique()
total_budget = school_budget.sum()
total_budget

#Making sure to have float along columns of reading score and math score
school_data_df["reading_score"] = school_data_df["reading_score"].astype(float)
school_data_df[ "math_score"] = school_data_df["math_score"].astype(float)

#Calculate average math and reading score
av_math_score = school_data_df["math_score"].mean()
av_math_score

av_read_score = school_data_df["reading_score"].mean()
av_read_score


#Calculate overall passing rate
average_score = av_read_score + av_math_score
pass_rate = average_score/2

#Calculating percentages of passing reading score
read = ["reading_score"]
pass_reading = len(school_data_df.loc[school_data_df["reading_score"] >= 70, read])
read_pass_score = pass_reading * 100 / students_count
read_pass_score

#Calculating percentages of passing math score
math = ["math_score"]
pass_math = len(school_data_df.loc[school_data_df["math_score"] >= 70, math])
math_pass_score = pass_math*100 / students_count
math_pass_score

#Creating and formatting District Summary Table
district_summary_df = pd.DataFrame({"Total Schools":[school_count],"Total Students":[students_count],"Total Budget":[total_budget], 
                                   "Average Math Score":[av_math_score], "Average Reading Score":[av_read_score], "% Passing Math"
                                   :[math_pass_score], "% Passing Reading":[read_pass_score], "% Overall Passing Rate":[pass_rate]})
district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map("${:,.2f}".format)
district_summary_df


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,80.431606


In [3]:
#2. School Summary: Create an overview table that summarizes key metrics about each school, including:
# School Name and Type, Total Students, Total School Budget, Per Student Budget, Average Math Score and Reading Score, % Passing Math
#and Reading, Overall Passing Rate (Average of the above two)

#Create a dataframe to hold the above results

#Grouping the original data frame

all_schools_df = school_data_df.groupby(["school_name"])



In [4]:
#School type
school_type = all_schools_df["type"].first()
school_type

#Total budget per school
school_budget = all_schools_df["budget"].first()
school_budget

#Total students per school
total_students = all_schools_df["Student ID"].count()
total_students

#Per Student Budget
student_budget = school_budget/total_students
student_budget

#Average Math Score
av_math = all_schools_df["math_score"].mean()
av_math

#Average Reading Score
av_read = all_schools_df["reading_score"].mean()

#% Passing Math, % Passing Reading, % Overall Passing Rate (Average of the previous two)
school_pass_math = school_data_df.loc[school_data_df["math_score"]>=70,:].groupby(["school_name"])["Student ID"].count()/total_students*100
school_pass_read = school_data_df.loc[school_data_df["reading_score"]>=70,:].groupby(["school_name"])["Student ID"].count()/total_students*100
overall_passing = (school_pass_math + school_pass_read)/2
overall_passing

School_summary_df = pd.DataFrame({"School Type":school_type,"Total Students":total_students,"Total School Budget":school_budget, 
                               "Per Student Budget":student_budget,"Average Math Score":av_math,
                               "Average Reading Score":av_read,"% Passing Math":school_pass_math,
                               "% Passing Reading":school_pass_read,"% Overall Passing Rate":overall_passing})
School_summary_df.index.name = None
School_summary_df["Total School Budget"] = School_summary_df["Total School Budget"].map("${:,.2f}".format)
School_summary_df["Per Student Budget"] = School_summary_df["Per Student Budget"].map("${:,.2f}".format)
School_summary_df.head()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,66.680064,81.93328,74.306672
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477,97.039828,95.586652
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471,80.739234,73.363852
Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.309602,79.299014,73.804308
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371,97.138965,95.265668


In [5]:
#Sort and display the top five schools in overall passing rate
Schools_TopPassRate_df = School_summary_df.sort_values("% Overall Passing Rate", ascending=False)
Schools_TopPassRate_df.head()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477,97.039828,95.586652
Thomas High School,Charter,1635,"$1,043,130.00",$638.00,83.418349,83.84893,93.272171,97.308869,95.29052
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.594595,95.945946,95.27027
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371,97.138965,95.265668
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.274201,83.989488,93.867718,96.539641,95.203679


In [6]:
#Sort and display the five worst-performing schools
Schools_WorstPassRate_df = School_summary_df.sort_values("% Overall Passing Rate")
Schools_WorstPassRate_df.head()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
Rodriguez High School,District,3999,"$2,547,363.00",$637.00,76.842711,80.744686,66.366592,80.220055,73.293323
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471,80.739234,73.363852
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.683922,81.316421,73.500171
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.057551,81.222432,73.639992
Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.309602,79.299014,73.804308


In [18]:
#Math Scores by Grade
#Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.
#Create a pandas series for each grade. Hint: use a conditional statement.
#Group each series by school
#Combine the series into a dataframe
#Optional: give the displayed data cleaner formatting

#all_schools_df = school_data_df.groupby(["school_name"])
#all_schools_df.count().head()
school_data_df.head()


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66.0,79.0,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94.0,61.0,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90.0,60.0,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67.0,58.0,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97.0,84.0,0,District,2917,1910635


In [29]:
value = ["grade"] 
school_data_df.loc[school_data_df["grade"] == "9th", "reading score"]


KeyError: 'reading score'