# Import dependencies and setup

In [1]:
#import dependencies
import pandas as pd

#create a reference to cvs schools file and import it into a pandas df
schools="Resources/schools_complete.csv"
schools_df=pd.read_csv(schools)
    #schools_df.head()

#create a reference to cvs students file and import it into a pandas df
students="Resources/students_complete.csv"
students_df=pd.read_csv(students)
    #students_df.head()

#create a unified dataframe
city_df=pd.merge(schools_df, students_df, on="school_name")

#display 
city_df.head()

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84


# 1. District Summary


In [2]:
#calculate total number of schools
tot_schools= city_df["school_name"].nunique()

#calculate total number of students
tot_students= city_df["student_name"].count()

#calculate total budget
tot_budget= schools_df["budget"].sum()

#display results to check
#tot_schools, tot_students,tot_budget

In [3]:
#define function to calculate passing rate for each subject

def rate(x, x_score):
    passing_x_df= city_df.loc[city_df[x_score] >= 70]
    passing_x= passing_x_df["student_name"].count()
    passing_x_perc= (passing_x/tot_students)*100
    return passing_x_perc
        

In [4]:
#MATH: calculate average score and passing rate
avg_math_score= city_df["math_score"].mean()
passing_math_perc=rate("math", "math_score")

#display results to check
#avg_math_score, passing_math_perc

In [5]:
#READING: calculate average score and passing rate
avg_read_score= city_df["reading_score"].mean()
passing_read_perc=rate("read", "reading_score")

#display results to check
#avg_read_score, passing_read_perc

In [6]:
#calculate the overall passing rate (average between math and reading passing rate)
avg_score= (passing_math_perc+passing_read_perc)/2

#display results to check
#avg_score 

In [7]:
#generate district summary table

district_summary_df=pd.DataFrame({
    "Total schools" : [tot_schools],
    "Total students" : [tot_students],
    "Total budget" : [tot_budget],
    "Average math score" : avg_math_score,
    "Average reading score" : avg_read_score,
    "Passing math (%)" : passing_math_perc,
    "Passing reading (%)" : passing_read_perc,
    "Overall passing rate (%)" : avg_score})

#change district summary formatting
district_form_df=district_summary_df[["Total schools","Total students","Total budget","Average math score",
                                    "Average reading score", "Passing math (%)", "Passing reading (%)", 
                                      "Overall passing rate (%)"]]
district_form_df["Total students"]=district_form_df["Total students"].map("{:,}".format)
district_form_df["Total budget"]=district_form_df["Total budget"].map("${:,}".format)
district_form_df["Average math score"]=district_form_df["Average math score"].map("{:.2f}".format)
district_form_df["Average reading score"]=district_form_df["Average reading score"].map("{:.2f}".format)
district_form_df["Passing math (%)"]=district_form_df["Passing math (%)"].map("{:.2f}".format)
district_form_df["Passing reading (%)"]=district_form_df["Passing reading (%)"].map("{:.2f}".format)
district_form_df["Overall passing rate (%)"]=district_form_df["Overall passing rate (%)"].map("{:.2f}".format)


district_form_df=district_form_df.rename(index={0:""})

#display formatted district summary
district_form_df

Unnamed: 0,Total schools,Total students,Total budget,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
,15,39170,"$24,649,428",78.99,81.88,74.98,85.81,80.39


# 2. School Summary


In [8]:
#group unified df by school
grouped_df=city_df.groupby("school_name")

#retrieve school type and generate df with correct index and column name
school_type=schools_df["type"]
school_type_df=pd.DataFrame(school_type).set_index(schools_df["school_name"]).rename(columns={"type":"Type"})

#calculate total students per school and generate df with correct column name
school_students=grouped_df["student_name"].count()
school_students_df=pd.DataFrame(school_students).rename(columns={"student_name":"Total students"})

#retrieve total budget per school and generate df with correct index and column name
school_budget= schools_df["budget"]
school_budget_df=pd.DataFrame(school_budget).set_index(schools_df["school_name"])\
                        .rename(columns={"budget":"Total budget"})

#calculate budget per student per school and generate df with correct index and column name
budget_per_student= school_budget_df["Total budget"]/school_students_df["Total students"]
budget_per_student_df=pd.DataFrame(budget_per_student).rename(columns={0:"Per student budget"})

#display results to check
#school_type_df, school_students_df, school_budget_df, budget_per_student_df

In [9]:
#define function to calculate passing rate for each subject (per school)

def school_rate(x, x_score):
    passing_x_df= city_df.loc[city_df[x_score] >= 70]
    grouped_x= passing_x_df.groupby("school_name")
    passing_x= grouped_x["student_name"].count()

    passing_x_perc= (passing_x/school_students)*100
    return passing_x_perc


In [10]:
#MATH: calculate average score and passing rate (per school)
avg_math_score= grouped_df["math_score"].mean()
avg_math_score_df=pd.DataFrame(avg_math_score).rename(columns={"math_score":"Average math score"})

passing_math_perc= school_rate("math", "math_score")
passing_math_perc_df=pd.DataFrame(passing_math_perc).rename(columns={"student_name":"Passing math (%)"})

#display results to check
#avg_math_score_df, passing_math_perc_df

In [11]:
#READING: calculate average score and passing rate (per school)
avg_read_score= grouped_df["reading_score"].mean()
avg_read_score_df=pd.DataFrame(avg_read_score).rename(columns={"reading_score":"Average reading score"})

passing_read_perc= school_rate("read", "reading_score")
passing_read_perc_df=pd.DataFrame(passing_read_perc).rename(columns={"student_name":"Passing reading (%)"})

#display results to check
#avg_read_score_df, passing_read_perc_df

In [12]:
#Calculate the overall passing rate (per school)
avg_score= (passing_math_perc+passing_read_perc)/2
avg_score_df=pd.DataFrame(avg_score).rename(columns={"student_name":"Overall passing rate (%)"})

#display results to check
#avg_score_df

In [13]:
#generate school summary
school_summary_df=pd.merge(school_type_df, school_students_df, on="school_name").merge(school_budget_df, 
                    on="school_name").merge(budget_per_student_df, on="school_name").merge(avg_math_score_df, 
                    on= "school_name").merge(avg_read_score_df, on= "school_name").merge(passing_math_perc_df, 
                    on= "school_name").merge(passing_read_perc_df, on= "school_name").merge(avg_score_df,on= "school_name")

#school summary formatting
school_form_df=school_summary_df[["Type","Total students","Total budget","Per student budget","Average math score",
                                    "Average reading score", "Passing math (%)", "Passing reading (%)", 
                                  "Overall passing rate (%)"]]
school_form_df["Total students"]=school_form_df["Total students"].map("{:,}".format)
school_form_df["Total budget"]=school_form_df["Total budget"].map("${:,}".format)
school_form_df["Per student budget"]=school_form_df["Per student budget"].map("${:,}".format)
school_form_df["Average math score"]=school_form_df["Average math score"].map("{:.2f}".format)
school_form_df["Average reading score"]=school_form_df["Average reading score"].map("{:.2f}".format)
school_form_df["Passing math (%)"]=school_form_df["Passing math (%)"].map("{:.2f}".format)
school_form_df["Passing reading (%)"]=school_form_df["Passing reading (%)"].map("{:.2f}".format)
school_form_df["Overall passing rate (%)"]=school_form_df["Overall passing rate (%)"].map("{:.2f}".format)


school_form_df=school_form_df.rename_axis('School name')

#display formatted school summary
school_form_df

Unnamed: 0_level_0,Type,Total students,Total budget,Per student budget,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
School name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Huang High School,District,2917,"$1,910,635",$655.0,76.63,81.18,65.68,81.32,73.5
Figueroa High School,District,2949,"$1,884,411",$639.0,76.71,81.16,65.99,80.74,73.36
Shelton High School,Charter,1761,"$1,056,600",$600.0,83.36,83.73,93.87,95.85,94.86
Hernandez High School,District,4635,"$3,022,020",$652.0,77.29,80.93,66.75,80.86,73.81
Griffin High School,Charter,1468,"$917,500",$625.0,83.35,83.82,93.39,97.14,95.27
Wilson High School,Charter,2283,"$1,319,574",$578.0,83.27,83.99,93.87,96.54,95.2
Cabrera High School,Charter,1858,"$1,081,356",$582.0,83.06,83.98,94.13,97.04,95.59
Bailey High School,District,4976,"$3,124,928",$628.0,77.05,81.03,66.68,81.93,74.31
Holden High School,Charter,427,"$248,087",$581.0,83.8,83.81,92.51,96.25,94.38
Pena High School,Charter,962,"$585,858",$609.0,83.84,84.04,94.59,95.95,95.27


# 3. Top Performing Schools (By Passing Rate)

In [14]:
#sort schools by overall passing rate (discending)
top_school_df= school_summary_df.sort_values("Overall passing rate (%)", ascending=False)
top_school_df["Total students"]=top_school_df["Total students"].map("{:,}".format)
top_school_df["Total budget"]=top_school_df["Total budget"].map("${:,}".format)
top_school_df["Per student budget"]=top_school_df["Per student budget"].map("${:,}".format)
top_school_df["Average math score"]=top_school_df["Average math score"].map("{:.2f}".format)
top_school_df["Average reading score"]=top_school_df["Average reading score"].map("{:.2f}".format)
top_school_df["Passing math (%)"]=top_school_df["Passing math (%)"].map("{:.2f}".format)
top_school_df["Passing reading (%)"]=top_school_df["Passing reading (%)"].map("{:.2f}".format)
top_school_df["Overall passing rate (%)"]=top_school_df["Overall passing rate (%)"].map("{:.2f}".format)
top_school_df=top_school_df.rename_axis('School name')

#display top 5 schools (formatted)
top_school_df.head()

Unnamed: 0_level_0,Type,Total students,Total budget,Per student budget,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
School name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,"$1,081,356",$582.0,83.06,83.98,94.13,97.04,95.59
Thomas High School,Charter,1635,"$1,043,130",$638.0,83.42,83.85,93.27,97.31,95.29
Pena High School,Charter,962,"$585,858",$609.0,83.84,84.04,94.59,95.95,95.27
Griffin High School,Charter,1468,"$917,500",$625.0,83.35,83.82,93.39,97.14,95.27
Wilson High School,Charter,2283,"$1,319,574",$578.0,83.27,83.99,93.87,96.54,95.2


# 4. Bottom Performing Schools (By Passing Rate)

In [15]:
#sort schools by overall passing rate (ascending)
bottom_school_df= school_summary_df.sort_values("Overall passing rate (%)")
bottom_school_df["Total students"]=bottom_school_df["Total students"].map("{:,}".format)
bottom_school_df["Total budget"]=bottom_school_df["Total budget"].map("${:,}".format)
bottom_school_df["Per student budget"]=bottom_school_df["Per student budget"].map("${:,}".format)
bottom_school_df["Average math score"]=bottom_school_df["Average math score"].map("{:.2f}".format)
bottom_school_df["Average reading score"]=bottom_school_df["Average reading score"].map("{:.2f}".format)
bottom_school_df["Passing math (%)"]=bottom_school_df["Passing math (%)"].map("{:.2f}".format)
bottom_school_df["Passing reading (%)"]=bottom_school_df["Passing reading (%)"].map("{:.2f}".format)
bottom_school_df["Overall passing rate (%)"]=bottom_school_df["Overall passing rate (%)"].map("{:.2f}".format)
bottom_school_df=bottom_school_df.rename_axis('School name')


#display bottom 5 schools (formatted)
bottom_school_df.head()

Unnamed: 0_level_0,Type,Total students,Total budget,Per student budget,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
School name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,"$2,547,363",$637.0,76.84,80.74,66.37,80.22,73.29
Figueroa High School,District,2949,"$1,884,411",$639.0,76.71,81.16,65.99,80.74,73.36
Huang High School,District,2917,"$1,910,635",$655.0,76.63,81.18,65.68,81.32,73.5
Johnson High School,District,4761,"$3,094,650",$650.0,77.07,80.97,66.06,81.22,73.64
Ford High School,District,2739,"$1,763,916",$644.0,77.1,80.75,68.31,79.3,73.8


# 5. Math Scores by Grade

In [16]:
#define function to calculate info for each grade

def info_grade(x, subject):
    grade_x_df= city_df.loc[city_df["grade"] == x]
    grade_x_df=grade_x_df.groupby("school_name")
    
    if subject == "math":
        math_x_avg= grade_x_df["math_score"].mean()
        return math_x_avg
    else:
        read_x_avg= grade_x_df["reading_score"].mean()
        return read_x_avg


In [17]:
#MATH: calculate average grade for 9th, 10th, 11th, and 12th grade

math_9th_avg=info_grade("9th", "math")
math_9th_avg_df=pd.DataFrame(math_9th_avg).rename(columns={"math_score": "9th"})

math_10th_avg=info_grade("10th", "math")
math_10th_avg_df=pd.DataFrame(math_10th_avg).rename(columns={"math_score": "10th"})

math_11th_avg=info_grade("11th", "math")
math_11th_avg_df=pd.DataFrame(math_11th_avg).rename(columns={"math_score": "11th"})

math_12th_avg=info_grade("12th", "math")
math_12th_avg_df=pd.DataFrame(math_12th_avg).rename(columns={"math_score": "12th"})
 
    
#generate math summary table   
math_summary_df=pd.merge(math_9th_avg_df, math_10th_avg_df, on="school_name").merge(math_11th_avg_df, on="school_name")\
                .merge(math_12th_avg_df, on="school_name")

math_summary_df=math_summary_df.rename_axis('School name')

#display
math_summary_df

Unnamed: 0_level_0,9th,10th,11th,12th
School name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.083676,76.996772,77.515588,76.492218
Cabrera High School,83.094697,83.154506,82.76556,83.277487
Figueroa High School,76.403037,76.539974,76.884344,77.151369
Ford High School,77.361345,77.672316,76.918058,76.179963
Griffin High School,82.04401,84.229064,83.842105,83.356164
Hernandez High School,77.438495,77.337408,77.136029,77.186567
Holden High School,83.787402,83.429825,85.0,82.855422
Huang High School,77.027251,75.908735,76.446602,77.225641
Johnson High School,77.187857,76.691117,77.491653,76.863248
Pena High School,83.625455,83.372,84.328125,84.121547


# 6. Reading Scores by Grade

In [18]:
#READING: calculate average grade for 9th, 10th, 11th, and 12th grade

read_9th_avg=info_grade("9th", "read")
read_9th_avg_df=pd.DataFrame(read_9th_avg).rename(columns={"reading_score": "9th"})

read_10th_avg=info_grade("10th", "read")
read_10th_avg_df=pd.DataFrame(read_10th_avg).rename(columns={"reading_score": "10th"})

read_11th_avg=info_grade("11th", "read")
read_11th_avg_df=pd.DataFrame(read_11th_avg).rename(columns={"reading_score": "11th"})

read_12th_avg=info_grade("12th", "read")
read_12th_avg_df=pd.DataFrame(read_12th_avg).rename(columns={"reading_score": "12th"})

#generate reading summary table   
read_summary_df=pd.merge(read_9th_avg_df, read_10th_avg_df, on="school_name").merge(read_11th_avg_df, on="school_name")\
                .merge(read_12th_avg_df, on="school_name")

read_summary_df=read_summary_df.rename_axis('School name')

#display
read_summary_df

Unnamed: 0_level_0,9th,10th,11th,12th
School name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.303155,80.907183,80.945643,80.912451
Cabrera High School,83.676136,84.253219,83.788382,84.287958
Figueroa High School,81.198598,81.408912,80.640339,81.384863
Ford High School,80.632653,81.262712,80.403642,80.662338
Griffin High School,83.369193,83.706897,84.288089,84.013699
Hernandez High School,80.86686,80.660147,81.39614,80.857143
Holden High School,83.677165,83.324561,83.815534,84.698795
Huang High School,81.290284,81.512386,81.417476,80.305983
Johnson High School,81.260714,80.773431,80.616027,81.227564
Pena High School,83.807273,83.612,84.335938,84.59116


# 7. Scores by School Spending

In [19]:
#find minimum and maximum school size to create bins
min_budget= school_summary_df["Per student budget"].min()
max_budget= school_summary_df["Per student budget"].max()

#display results to check
#max_budget, min_budget, max_budget/4

In [20]:
#create budget ranges
bins= [0, 600, 625, 650, 675]
group_names= ["<$600", "$600-625", "$625-650", "$650-675"]
    
school_summary_df["Per student budget ranges"]= pd.cut(school_summary_df["Per student budget"],bins,
                labels=group_names, include_lowest= True)

#display to check
#school_summary_df

In [21]:
#create summary table
school_spending_df= school_summary_df[["Per student budget ranges", "Average math score",
                                    "Average reading score", "Passing math (%)", "Passing reading (%)",
                                    "Overall passing rate (%)"]]
spending_summary_df= school_spending_df.groupby("Per student budget ranges").mean()

#display
spending_summary_df

Unnamed: 0_level_0,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
Per student budget ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$600,83.43621,83.892196,93.541501,96.459627,95.000564
$600-625,83.595708,83.930728,93.993483,96.542455,95.267969
$625-650,78.032719,81.416375,71.112408,83.453814,77.283111
$650-675,76.959583,81.058567,66.218444,81.08971,73.654077


# 8. Scores by School Size

In [22]:
#find minimum and maximum per student budget to create bins
min_size= school_summary_df["Total students"].min()
max_size= school_summary_df["Total students"].max()

#display results to check
#max_size, min_size, max_size/3

In [23]:
#create size ranges

bins= [0, 1000, 2500, 5000]
group_names= ["Small (<1000)", "Medium (1000-2500)", "Large (2500-5000)"]
    
school_summary_df["School size ranges"]= pd.cut(school_summary_df["Total students"],bins,
                                   labels=group_names, include_lowest= True)
#display to check
#school_summary_df

In [24]:
#create summary table
school_size_df= school_summary_df[["School size ranges", "Average math score",
                                "Average reading score", "Passing math (%)", "Passing reading (%)",
                                "Overall passing rate (%)"]]
size_summary_df=school_size_df.groupby("School size ranges").mean()

#display
size_summary_df

Unnamed: 0_level_0,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
School size ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.821598,83.929843,93.550225,96.099437,94.824831
Medium (1000-2500),83.357937,83.88528,93.644365,96.74884,95.196603
Large (2500-5000),76.956733,80.966636,66.548453,80.799062,73.673757


# 9. Scores by School Type

In [25]:
#create summary table
school_type_df= school_summary_df[["Type", "Average math score",
                                "Average reading score", "Passing math (%)", "Passing reading (%)",
                                "Overall passing rate (%)"]]

school_type_df=school_type_df.rename(columns={"Type": "School type"})
type_summary_df= school_type_df.groupby("School type").mean()

#display
type_summary_df

Unnamed: 0_level_0,Average math score,Average reading score,Passing math (%),Passing reading (%),Overall passing rate (%)
School type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.473852,83.896421,93.62083,96.586489,95.10366
District,76.956733,80.966636,66.548453,80.799062,73.673757
