# Import dependencies and setup

In [1]:
#import dependencies
import pandas as pd

# Locations of the raw CSV exports for schools and students.
schools="Resources/schools_complete.csv"
students="Resources/students_complete.csv"

# Read each CSV file into its own DataFrame.
schools_df = pd.read_csv(schools)
students_df = pd.read_csv(students)

# Attach the school-level columns to every student row by joining on the
# shared "school_name" key, then preview the unified frame.
city_df = schools_df.merge(students_df, on="school_name")
city_df.head()

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84


# 1. District Summary


In [2]:
# District-wide headline counts.

# Number of distinct school names present in the merged data.
tot_schools = city_df.loc[:, "school_name"].nunique()

# Number of student rows that carry a (non-null) name.
tot_students = city_df.loc[:, "student_name"].count()

# Sum budgets from the per-school frame; the merged frame repeats each
# school's budget on every student row.
tot_budget = schools_df.loc[:, "budget"].sum()

In [3]:
#define function to calculate passing rate for each subject

def info(x, x_score, passing_score=70, df=None):
    """Return the percentage of students whose score meets the passing mark.

    Parameters
    ----------
    x : str
        Subject label. It does not take part in the calculation and is kept
        only so existing calls such as ``info("math", "math_score")`` work.
    x_score : str
        Name of the score column to evaluate (e.g. ``"math_score"``).
    passing_score : int or float, default 70
        Minimum score (inclusive) that counts as passing.
    df : pandas.DataFrame, optional
        Student-level data holding a ``student_name`` column and the
        ``x_score`` column. Defaults to the notebook-level ``city_df``.

    Returns
    -------
    float
        Passing students as a percentage (0-100) of all students with a
        non-null ``student_name``; NaN when there are no such students.
    """
    students = city_df if df is None else df

    # Take the denominator from the same frame as the numerator so the result
    # cannot drift if a separately stored total goes stale.
    total = students["student_name"].count()
    if total == 0:
        return float("nan")

    passing = students.loc[students[x_score] >= passing_score, "student_name"].count()
    return (passing / total) * 100
        

In [4]:
# District-wide math results: passing percentage and mean score.
# NOTE: both names are rebound to per-school Series in the School Summary section.
passing_math_perc = info("math", "math_score")
avg_math_score = city_df.loc[:, "math_score"].mean()

In [5]:
# District-wide reading results: passing percentage and mean score.
# NOTE: both names are rebound to per-school Series in the School Summary section.
passing_read_perc = info("read", "reading_score")
avg_read_score = city_df.loc[:, "reading_score"].mean()

In [6]:
# Overall passing rate, defined here as the simple mean of the math and
# reading passing percentages (not the share of students passing both).
avg_score = (passing_read_perc + passing_math_perc) / 2

In [7]:
# District summary table: a single row of headline metrics for the district.
# Every value is wrapped in a one-element list so all columns are built the
# same way.
district_summary_df = pd.DataFrame({
    "Total schools": [tot_schools],
    "Total students": [tot_students],
    "Total budget": [tot_budget],
    "Average math score": [avg_math_score],
    "Average reading score": [avg_read_score],
    "Passing math (%)": [passing_math_perc],
    "Passing reading (%)": [round(passing_read_perc, 2)],
    "Overall passing rate (%)": [round(avg_score, 2)],
})

# Column order of the displayed table.
district_columns = [
    "Total schools", "Total students", "Total budget",
    "Average math score", "Average reading score",
    "Passing math (%)", "Passing reading (%)", "Overall passing rate (%)",
]

# Display template per column; columns not listed are shown unformatted.
district_formats = {
    "Total students": "{:,}",
    "Total budget": "${:,}",
    "Average math score": "{:.2f}",
    "Average reading score": "{:.2f}",
    "Passing math (%)": "{:.2f}",
    "Passing reading (%)": "{:.2f}",
    "Overall passing rate (%)": "{:.2f}",
}

# Format an explicit copy: assigning into a column subset of
# district_summary_df raises SettingWithCopyWarning, and the copy keeps the
# numeric values in district_summary_df untouched.
district_form_df = district_summary_df[district_columns].copy()
for column_name, template in district_formats.items():
    district_form_df[column_name] = district_form_df[column_name].map(template.format)

district_form_df

district_form_df

Unnamed: 0,Total schools,Total students,Total budget,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
0,15,39170,"$24,649,428",78.99,81.88,74.98,85.81,80.39


# 2. School Summary


In [8]:
# Group the merged student rows by school; later cells reuse this grouping.
grouped_df = city_df.groupby("school_name")

# School type, taken from the per-school frame and indexed by school name.
school_type = schools_df["type"]
school_type_df = school_type.to_frame("Type").set_index(schools_df["school_name"])

# Total budget, taken from the per-school frame and indexed by school name.
school_budget = schools_df["budget"]
school_budget_df = school_budget.to_frame("Total budget").set_index(schools_df["school_name"])

# Head count per school: number of named student rows in each group.
school_students = grouped_df["student_name"].count()
school_students_df = school_students.to_frame("Total students")

# Budget per student; the division aligns both Series on the school-name index.
budget_per_student = school_budget_df["Total budget"] / school_students_df["Total students"]
budget_per_student_df = budget_per_student.to_frame("Per student budget")

In [9]:
#define function to calculate passing rate for each subject

def info_2(x, x_score, passing_score=70, df=None):
    """Return each school's passing percentage for one score column.

    Parameters
    ----------
    x : str
        Subject label. It does not take part in the calculation and is kept
        only so existing calls such as ``info_2("math", "math_score")`` work.
    x_score : str
        Name of the score column to evaluate (e.g. ``"math_score"``).
    passing_score : int or float, default 70
        Minimum score (inclusive) that counts as passing.
    df : pandas.DataFrame, optional
        Student-level data holding ``school_name``, ``student_name`` and the
        ``x_score`` column. Defaults to the notebook-level ``city_df``.

    Returns
    -------
    pandas.Series
        Passing percentage (0-100, rounded to two decimals) indexed by
        ``school_name`` and named ``student_name``.
    """
    students = city_df if df is None else df

    # Head count per school, computed from the same frame as the numerator.
    per_school = students.groupby("school_name")["student_name"].count()

    passing = (
        students.loc[students[x_score] >= passing_score]
        .groupby("school_name")["student_name"]
        .count()
    )
    # A school with no passing students is absent from `passing`; without the
    # reindex the division would give NaN for it instead of 0.
    passing = passing.reindex(per_school.index, fill_value=0)

    return round((passing / per_school) * 100, 2)


In [10]:
# MATH
# Mean math score per school, rounded to two decimals.
avg_math_score = grouped_df["math_score"].mean().round(2)
avg_math_score_df = avg_math_score.to_frame("Average math score")

# Share of each school's students passing math.
passing_math_perc = info_2("math", "math_score")
passing_math_perc_df = passing_math_perc.to_frame("Passing math (%)")

In [11]:
# READING
# Mean reading score per school, rounded to two decimals.
avg_read_score = grouped_df["reading_score"].mean().round(2)
avg_read_score_df = avg_read_score.to_frame("Average reading score")

# Share of each school's students passing reading.
passing_read_perc = info_2("read", "reading_score")
passing_read_perc_df = passing_read_perc.to_frame("Passing reading (%)")

In [12]:
# Per-school overall passing rate: the mean of the math and reading passing
# percentages, rounded to two decimals.
avg_score = ((passing_math_perc + passing_read_perc) / 2).round(2)
avg_score_df = avg_score.to_frame("Overall passing rate (%)")

In [13]:
# School summary: combine the per-school frames into one table keyed by
# school name. Each merge is an inner join on "school_name".
school_frames = [
    school_type_df,
    school_students_df,
    school_budget_df,
    budget_per_student_df,
    avg_math_score_df,
    avg_read_score_df,
    passing_math_perc_df,
    passing_read_perc_df,
    avg_score_df,
]
school_summary_df = school_frames[0]
for school_frame in school_frames[1:]:
    school_summary_df = school_summary_df.merge(school_frame, on="school_name")

# Column order of the displayed table.
school_columns = [
    "Type", "Total students", "Total budget", "Per student budget",
    "Average math score", "Average reading score",
    "Passing math (%)", "Passing reading (%)", "Overall passing rate (%)",
]

# Display template per column; "Type" is shown as-is.
school_formats = {
    "Total students": "{:,}",
    "Total budget": "${:,}",
    "Per student budget": "${:,}",
    "Average math score": "{:.2f}",
    "Average reading score": "{:.2f}",
    "Passing math (%)": "{:.2f}",
    "Passing reading (%)": "{:.2f}",
    "Overall passing rate (%)": "{:.2f}",
}

# Format an explicit copy: assigning into a column subset of school_summary_df
# raises SettingWithCopyWarning, and later cells still need the numeric
# values in school_summary_df for sorting and binning.
school_form_df = school_summary_df[school_columns].copy()
for column_name, template in school_formats.items():
    school_form_df[column_name] = school_form_df[column_name].map(template.format)

school_form_df

school_form_df

Unnamed: 0_level_0,Type,Total students,Total budget,Per student budget,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Huang High School,District,2917,"$1,910,635",$655.0,76.63,81.18,65.68,81.32,73.5
Figueroa High School,District,2949,"$1,884,411",$639.0,76.71,81.16,65.99,80.74,73.36
Shelton High School,Charter,1761,"$1,056,600",$600.0,83.36,83.73,93.87,95.85,94.86
Hernandez High School,District,4635,"$3,022,020",$652.0,77.29,80.93,66.75,80.86,73.81
Griffin High School,Charter,1468,"$917,500",$625.0,83.35,83.82,93.39,97.14,95.26
Wilson High School,Charter,2283,"$1,319,574",$578.0,83.27,83.99,93.87,96.54,95.21
Cabrera High School,Charter,1858,"$1,081,356",$582.0,83.06,83.98,94.13,97.04,95.58
Bailey High School,District,4976,"$3,124,928",$628.0,77.05,81.03,66.68,81.93,74.31
Holden High School,Charter,427,"$248,087",$581.0,83.8,83.81,92.51,96.25,94.38
Pena High School,Charter,962,"$585,858",$609.0,83.84,84.04,94.59,95.95,95.27


# 3. Top Performing Schools (By Passing Rate)

In [14]:
# Rank schools from the highest to the lowest overall passing rate.
top_school_df = school_summary_df.sort_values(by="Overall passing rate (%)", ascending=False)

# Turn the count and dollar columns into display strings.
for label, template in (
    ("Total students", "{:,}"),
    ("Total budget", "${:,}"),
    ("Per student budget", "${:,}"),
):
    top_school_df[label] = top_school_df[label].map(template.format)

top_school_df.head(5)

Unnamed: 0_level_0,Type,Total students,Total budget,Per student budget,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,"$1,081,356",$582.0,83.06,83.98,94.13,97.04,95.58
Thomas High School,Charter,1635,"$1,043,130",$638.0,83.42,83.85,93.27,97.31,95.29
Pena High School,Charter,962,"$585,858",$609.0,83.84,84.04,94.59,95.95,95.27
Griffin High School,Charter,1468,"$917,500",$625.0,83.35,83.82,93.39,97.14,95.26
Wilson High School,Charter,2283,"$1,319,574",$578.0,83.27,83.99,93.87,96.54,95.21


# 4. Bottom Performing Schools (By Passing Rate)

In [15]:
# Rank schools from the lowest to the highest overall passing rate.
bottom_school_df = school_summary_df.sort_values(by="Overall passing rate (%)", ascending=True)

# Turn the count and dollar columns into display strings.
bottom_formats = {
    "Total students": "{:,}",
    "Total budget": "${:,}",
    "Per student budget": "${:,}",
}
for label in bottom_formats:
    bottom_school_df[label] = bottom_school_df[label].map(bottom_formats[label].format)

bottom_school_df.head(5)

Unnamed: 0_level_0,Type,Total students,Total budget,Per student budget,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,"$2,547,363",$637.0,76.84,80.74,66.37,80.22,73.3
Figueroa High School,District,2949,"$1,884,411",$639.0,76.71,81.16,65.99,80.74,73.36
Huang High School,District,2917,"$1,910,635",$655.0,76.63,81.18,65.68,81.32,73.5
Johnson High School,District,4761,"$3,094,650",$650.0,77.07,80.97,66.06,81.22,73.64
Hernandez High School,District,4635,"$3,022,020",$652.0,77.29,80.93,66.75,80.86,73.81


# 5. Math Scores by Grade

In [16]:
#define function to calculate info for each grade

def info_grade(x, subject, df=None):
    """Return each school's average score for one grade and subject.

    Parameters
    ----------
    x : str
        Grade label to keep, matched exactly against the ``grade`` column
        (e.g. ``"9th"``).
    subject : str
        ``"math"`` selects ``math_score``; any other value selects
        ``reading_score``.
    df : pandas.DataFrame, optional
        Student-level data holding ``school_name``, ``grade`` and the score
        columns. Defaults to the notebook-level ``city_df``.

    Returns
    -------
    pandas.Series
        Mean score rounded to two decimals, indexed by ``school_name`` and
        named after the selected score column.
    """
    students = city_df if df is None else df

    # Choose the column once instead of repeating the calculation per branch.
    score_column = "math_score" if subject == "math" else "reading_score"

    in_grade = students.loc[students["grade"] == x]
    return round(in_grade.groupby("school_name")[score_column].mean(), 2)


In [17]:
# Math score summary: per-school average for each grade level, one column
# per grade. info_grade already rounds to two decimals.
math_9th_avg = info_grade("9th", "math")
math_10th_avg = info_grade("10th", "math")
math_11th_avg = info_grade("11th", "math")
math_12th_avg = info_grade("12th", "math")

# Name each column after its grade.
math_9th_avg_df = math_9th_avg.to_frame("9th")
math_10th_avg_df = math_10th_avg.to_frame("10th")
math_11th_avg_df = math_11th_avg.to_frame("11th")
math_12th_avg_df = math_12th_avg.to_frame("12th")

# Line the four grade columns up on the school name.
math_summary_df = (
    math_9th_avg_df
    .merge(math_10th_avg_df, on="school_name")
    .merge(math_11th_avg_df, on="school_name")
    .merge(math_12th_avg_df, on="school_name")
)

math_summary_df

Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.08,77.0,77.52,76.49
Cabrera High School,83.09,83.15,82.77,83.28
Figueroa High School,76.4,76.54,76.88,77.15
Ford High School,77.36,77.67,76.92,76.18
Griffin High School,82.04,84.23,83.84,83.36
Hernandez High School,77.44,77.34,77.14,77.19
Holden High School,83.79,83.43,85.0,82.86
Huang High School,77.03,75.91,76.45,77.23
Johnson High School,77.19,76.69,77.49,76.86
Pena High School,83.63,83.37,84.33,84.12


# 6. Reading Scores by Grade

In [18]:
# Reading score summary: per-school average for each grade level, one
# column per grade.
read_9th_avg = info_grade("9th", "read")
read_10th_avg = info_grade("10th", "read")
read_11th_avg = info_grade("11th", "read")
read_12th_avg = info_grade("12th", "read")

# Name each column after its grade.
read_9th_avg_df = read_9th_avg.to_frame("9th")
read_10th_avg_df = read_10th_avg.to_frame("10th")
read_11th_avg_df = read_11th_avg.to_frame("11th")
read_12th_avg_df = read_12th_avg.to_frame("12th")

# Line the four grade columns up on the school name.
read_summary_df = (
    read_9th_avg_df
    .merge(read_10th_avg_df, on="school_name")
    .merge(read_11th_avg_df, on="school_name")
    .merge(read_12th_avg_df, on="school_name")
)

read_summary_df

Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.3,80.91,80.95,80.91
Cabrera High School,83.68,84.25,83.79,84.29
Figueroa High School,81.2,81.41,80.64,81.38
Ford High School,80.63,81.26,80.4,80.66
Griffin High School,83.37,83.71,84.29,84.01
Hernandez High School,80.87,80.66,81.4,80.86
Holden High School,83.68,83.32,83.82,84.7
Huang High School,81.29,81.51,81.42,80.31
Johnson High School,81.26,80.77,80.62,81.23
Pena High School,83.81,83.61,84.34,84.59


# 7. Scores by School Spending

In [19]:
# Inspect the spread of per-student budgets to choose the spending bins.
max_budget = school_summary_df.loc[:, "Per student budget"].max()
min_budget = school_summary_df.loc[:, "Per student budget"].min()

In [20]:
# Spending brackets in dollars per student. pd.cut closes each interval on
# the right by default, so a school at exactly $600 lands in the first
# bracket and one at exactly $625 lands in the second.
group_names = ["<$600", "$600-625", "$625-650", "$650-675"]
bins = [0, 600, 625, 650, 675]

# Tag every school with its bracket (adds a column to school_summary_df).
school_summary_df["Per student budget ranges"] = pd.cut(
    school_summary_df["Per student budget"],
    bins,
    labels=group_names,
    include_lowest=True,
)

In [21]:
# Score metrics averaged over the schools in each spending bracket.
# These are unweighted means of per-school values, rounded to two decimals.
school_spending_df = school_summary_df.loc[:, [
    "Per student budget ranges",
    "Average math score",
    "Average reading score",
    "Passing math (%)",
    "Passing reading (%)",
    "Overall passing rate (%)",
]]
spending_summary_df = school_spending_df.groupby("Per student budget ranges").mean().round(2)
spending_summary_df

Unnamed: 0_level_0,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
Per student budget ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$600,83.43,83.89,93.54,96.46,95.0
$600-625,83.6,83.93,93.99,96.54,95.26
$625-650,78.03,81.42,71.11,83.45,77.29
$650-675,76.96,81.06,66.22,81.09,73.66


# 8. Scores by School Size

In [22]:
# Inspect the spread of school sizes (student counts) to choose the size bins.
max_size = school_summary_df.loc[:, "Total students"].max()
min_size = school_summary_df.loc[:, "Total students"].min()

In [23]:
# Size brackets by student count. pd.cut closes each interval on the right
# by default, so a school with exactly 1000 students counts as "Small".
# NOTE: this rebinds `bins` and `group_names` from the spending section.
group_names = ["Small (<1000)", "Medium (1000-2500)", "Large (2500-5000)"]
bins = [0, 1000, 2500, 5000]

# Tag every school with its size bracket (adds a column to school_summary_df).
school_summary_df["School size ranges"] = pd.cut(
    school_summary_df["Total students"],
    bins,
    labels=group_names,
    include_lowest=True,
)

In [24]:
# Score metrics averaged over the schools in each size bracket.
# These are unweighted means of per-school values, rounded to two decimals.
school_size_df = school_summary_df.loc[:, [
    "School size ranges",
    "Average math score",
    "Average reading score",
    "Passing math (%)",
    "Passing reading (%)",
    "Overall passing rate (%)",
]]
size_summary_df = school_size_df.groupby("School size ranges").mean().round(2)
size_summary_df

Unnamed: 0_level_0,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
School size ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.82,83.93,93.55,96.1,94.82
Medium (1000-2500),83.36,83.89,93.64,96.75,95.2
Large (2500-5000),76.96,80.97,66.55,80.8,73.68


# 9. Scores by School Type

In [25]:
# Score metrics averaged over the schools of each type (unweighted means of
# per-school values, rounded to two decimals).
# NOTE: this rebinds school_type_df, which earlier held the one-column "Type"
# frame used to build school_summary_df.
school_type_df = school_summary_df.loc[:, [
    "Type",
    "Average math score",
    "Average reading score",
    "Passing math (%)",
    "Passing reading (%)",
    "Overall passing rate (%)",
]].rename(columns={"Type": "School type"})

type_summary_df = school_type_df.groupby("School type").mean().round(2)
type_summary_df

Unnamed: 0_level_0,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
School type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.47,83.9,93.62,96.59,95.1
District,76.96,80.97,66.55,80.8,73.68
