# PyCity Schools Analysis

In [1]:
# Import dependencies
import pandas as pd
import numpy as np

### Treating schools file

In [2]:
# Read schools file
df_schools = pd.read_csv('./datasource/schools_complete.csv')
df_schools.head() 

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [3]:
# Summarize Schools data.
dic = {'School ID': 'nunique', 'size':'sum' , 'budget':'sum'}
sum_schools = pd.DataFrame(df_schools.agg(dic)).T

# Rename columns
rename_cols = {"School ID":"Total Schools", "size":"Total Students", "budget":"Total Budget"}
sum_schools.rename(columns=rename_cols, inplace=True)
sum_schools

Unnamed: 0,Total Schools,Total Students,Total Budget
0,15,39170,24649428


### Treating students file

In [4]:
# Read students file
df_students = pd.read_csv('./datasource/students_complete.csv')

# Add columns for pass math and pass reading
df_students['pass_math'] = df_students['math_score'].apply(lambda score: 1 if score >=70 else 0)
df_students['pass_read'] = df_students['reading_score'].apply(lambda score: 1 if score >=70 else 0)
df_students.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,pass_math,pass_read
0,0,Paul Bradley,M,9th,Huang High School,66,79,1,0
1,1,Victor Smith,M,12th,Huang High School,94,61,0,1
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,1
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,0
4,4,Bonnie Ray,F,9th,Huang High School,97,84,1,1


In [5]:
# Summarize Students data.
dic = {"Student ID": "nunique", "math_score":"mean",  'reading_score':'mean',  'pass_math':'sum',  'pass_read':'sum'}
sum_students = pd.DataFrame(df_students.agg(dic)).T
# Rename columns
rename_cols = {"Student ID":"Total Students", "math_score":"Average Math Score", "reading_score":"Average Reading Score"}
sum_students.rename(columns=rename_cols, inplace=True) 


In [6]:
# Compute Students Percentages
sum_students["% Passing Math"] = sum_students["pass_math"] / sum_students['Total Students'] * 100
sum_students["% Passing Reading"] =  sum_students["pass_read"] / sum_students['Total Students'] * 100
# Really??? I belive "% Overall Passing Rate" should be 
# ( "% Overall Passing Rate Passing Math" + "% Passing Reading")/2, am I right?
sum_students["% Overall Passing Rate"] = (sum_students["Average Math Score"] + sum_students["Average Reading Score"]) / 2
# sum_students

# District Summary

In [7]:
# Merge Schools and Students summaries 
district_summary = pd.merge(sum_schools , sum_students, on="Total Students", how="outer")

# Format before print
district_summary["Total Budget"] = district_summary["Total Budget"].astype(float).map("${:,.2f}".format)
district_summary["Total Students"] = district_summary["Total Students"].astype(int).map("{:,.0f}".format)
district_summary["% Passing Math"] = district_summary["% Passing Math"].astype(float).map("{:.2f}%".format)
district_summary["% Passing Reading"] = district_summary["% Passing Reading"].astype(float).map("{:.2f}%".format)
district_summary["% Overall Passing Rate"] = district_summary["% Overall Passing Rate"].astype(float).map("{:.2f}%".format)
district_summary["Average Math Score"] = district_summary["Average Math Score"].astype(float).map("{:,.3f}".format)
district_summary["Average Reading Score"] = district_summary["Average Reading Score"].astype(float).map("{:,.3f}".format)

# Drop unnecessary columns
district_summary.drop(['pass_math','pass_read'], axis=1, inplace=True)
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,"$24,649,428.00",78.985,81.878,74.98%,85.81%,80.43%


# School Summary

In [8]:
# Merging main tables
main_tbl = pd.merge(df_schools, df_students, how="left", on="school_name")
main_tbl.head()

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score,pass_math,pass_read
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79,1,0
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61,0,1
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60,0,1
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58,0,0
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84,1,1


In [9]:
# Create an overview table that summarizes key metrics about each school
aggregation = {"Student ID": "count", "math_score":"mean",  'reading_score':'mean',"pass_math":"sum","pass_read":"sum"}
agg_by_school = main_tbl.groupby(["School ID","school_name","type","budget"],axis=0).agg(aggregation)

# Reset index 
agg_by_school.reset_index(inplace=True)
agg_by_school.set_index("School ID",inplace=True)

# Rename columns
renamecol =   {"Student ID": "Total Students", "math_score":"Average Math Score",  'reading_score':'Average Reading Score', \
               "school_name":"School Name","type":"School Type","budget":"Total School Budget"}
agg_by_school.rename(columns=renamecol,inplace=True)

In [10]:
# Compute Budget Per Student and percentages
agg_by_school["Per Student Budget"] = agg_by_school['Total School Budget']/agg_by_school['Total Students']
agg_by_school["% Passing Math"] = agg_by_school['pass_math']/agg_by_school['Total Students'] * 100
agg_by_school["% Passing Reading"] = agg_by_school['pass_read']/agg_by_school['Total Students'] * 100
agg_by_school["% Overall Passing Rate"] = (agg_by_school['% Passing Math'] + agg_by_school['% Passing Reading']) / 2 

# Copy agg_by_school before format. It will be used ahead.
copy_agg_by_school = agg_by_school.copy(deep=True)

# Format output.
agg_by_school["Total Students"] = agg_by_school["Total Students"].astype(float).map("{:,.0f}".format)
agg_by_school["Total School Budget"] = agg_by_school["Total School Budget"].astype(float).map("${:,.2f}".format)
agg_by_school["Per Student Budget"] = agg_by_school["Per Student Budget"].astype(float).map("${:,.2f}".format)
agg_by_school["% Passing Math"] = agg_by_school["% Passing Math"].astype(float).map("{:.2f}%".format)
agg_by_school["% Passing Reading"] = agg_by_school["% Passing Reading"].astype(float).map("{:.2f}%".format)
agg_by_school["% Overall Passing Rate"] = agg_by_school["% Overall Passing Rate"].astype(float).map("{:.3f}%".format)
agg_by_school["Average Math Score"] = agg_by_school["Average Math Score"].astype(float).map("{:,.3f}".format)
agg_by_school["Average Reading Score"] = agg_by_school["Average Reading Score"].astype(float).map("{:,.3f}".format)

# Remove index from "School ID", it will be dropped.
agg_by_school.reset_index(inplace=True)
# Drop unnecessary columns.
agg_by_school.drop(['pass_math','pass_read', 'School ID'], axis = 1, inplace = True)
# Set the index to "School Name".
agg_by_school.set_index("School Name",inplace=True)
# Organize columns position for presentation
agg_by_school = agg_by_school[["School Type","Total Students","Total School Budget","Per Student Budget","Average Math Score", \
                              "Average Reading Score","% Passing Math","% Passing Reading","% Overall Passing Rate"]]

## Top Performing Schools (By Passing Rate)

In [11]:
agg_by_school.sort_values("% Overall Passing Rate",ascending=False).head(5)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.062,83.976,94.13%,97.04%,95.587%
Thomas High School,Charter,1635,"$1,043,130.00",$638.00,83.418,83.849,93.27%,97.31%,95.291%
Pena High School,Charter,962,"$585,858.00",$609.00,83.84,84.045,94.59%,95.95%,95.270%
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351,83.817,93.39%,97.14%,95.266%
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.274,83.989,93.87%,96.54%,95.204%


## Bottom Performing Schools (By Passing Rate)

In [12]:
agg_by_school.sort_values("% Overall Passing Rate",ascending=True).head(5)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,"$2,547,363.00",$637.00,76.843,80.745,66.37%,80.22%,73.293%
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.712,81.158,65.99%,80.74%,73.364%
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629,81.183,65.68%,81.32%,73.500%
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072,80.966,66.06%,81.22%,73.640%
Ford High School,District,2739,"$1,763,916.00",$644.00,77.103,80.746,68.31%,79.30%,73.804%


## Math Scores by Grade

In [13]:
# Create DataFrame for Math Score average Grouping By School ID and Grade  
grade_math_score = main_tbl.pivot_table(index=["school_name"], columns='grade', values='math_score', aggfunc='mean') 

# Sort Grade and rename titles name for presentation.
columns = ['9th', '10th','11th','12th']
grade_math_score = grade_math_score[columns]
grade_math_score.columns.name = 'Grade'
grade_math_score.index.name = 'School Name'

# Format before output
grade_math_score["9th"] = grade_math_score["9th"].astype(float).map("{:,.3f}".format)
grade_math_score["10th"] = grade_math_score["10th"].astype(float).map("{:,.3f}".format)
grade_math_score["11th"] = grade_math_score["11th"].astype(float).map("{:,.3f}".format)
grade_math_score["12th"] = grade_math_score["12th"].astype(float).map("{:,.3f}".format)
grade_math_score

Grade,9th,10th,11th,12th
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.084,76.997,77.516,76.492
Cabrera High School,83.095,83.155,82.766,83.277
Figueroa High School,76.403,76.54,76.884,77.151
Ford High School,77.361,77.672,76.918,76.18
Griffin High School,82.044,84.229,83.842,83.356
Hernandez High School,77.438,77.337,77.136,77.187
Holden High School,83.787,83.43,85.0,82.855
Huang High School,77.027,75.909,76.447,77.226
Johnson High School,77.188,76.691,77.492,76.863
Pena High School,83.625,83.372,84.328,84.122


## Reading Scores by Grade

In [14]:
# Create DataFrame for Reading Score average Grouping By School ID and Grade  
grade_reading_score = main_tbl.pivot_table(index=["school_name"], columns='grade', values='reading_score', aggfunc='mean') 

# Sort Grade and rename titles name for presentation.
columns = ['9th', '10th','11th','12th']
grade_reading_score = grade_reading_score[columns]
grade_reading_score.columns.name = 'Grade'
grade_reading_score.index.name = 'School Name'

# Format before output
grade_reading_score["9th"] = grade_reading_score["9th"].astype(float).map("{:,.3f}".format)
grade_reading_score["10th"] = grade_reading_score["10th"].astype(float).map("{:,.3f}".format)
grade_reading_score["11th"] = grade_reading_score["11th"].astype(float).map("{:,.3f}".format)
grade_reading_score["12th"] = grade_reading_score["12th"].astype(float).map("{:,.3f}".format)
grade_reading_score

Grade,9th,10th,11th,12th
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.303,80.907,80.946,80.912
Cabrera High School,83.676,84.253,83.788,84.288
Figueroa High School,81.199,81.409,80.64,81.385
Ford High School,80.633,81.263,80.404,80.662
Griffin High School,83.369,83.707,84.288,84.014
Hernandez High School,80.867,80.66,81.396,80.857
Holden High School,83.677,83.325,83.816,84.699
Huang High School,81.29,81.512,81.417,80.306
Johnson High School,81.261,80.773,80.616,81.228
Pena High School,83.807,83.612,84.336,84.591


## Scores by School Spending

In [15]:
# Create bins and labels group.
bins = [0, 585, 615, 645, 675]
groups = ["<$585", "$585-615", "$615-645", "$645-675"]

# Add a column based on average Spending Ranges on df grouped by school.
copy_agg_by_school["Spending Ranges (Per Student)"] = pd.cut(copy_agg_by_school["Per Student Budget"], bins, labels=groups)

# Create new df wih aggregation to compute numbers by Spending Ranges (Per Student)
aggregation = {'Average Math Score':'mean', 'Average Reading Score':"mean", "% Passing Math":"mean", \
               "% Passing Reading":"mean",  "% Overall Passing Rate":"mean" }
scores_school_spending = copy_agg_by_school.groupby("Spending Ranges (Per Student)").agg(aggregation) 

# Format to presentation
scores_school_spending["% Passing Math"] = scores_school_spending["% Passing Math"].astype(float).map("{:.2f}%".format)
scores_school_spending["% Passing Reading"] = scores_school_spending["% Passing Reading"].astype(float).map("{:.2f}%".format)
scores_school_spending["% Overall Passing Rate"] = scores_school_spending["% Overall Passing Rate"].astype(float) \
.map("{:.2f}%".format)
scores_school_spending["Average Math Score"] = scores_school_spending["Average Math Score"].astype(float).map("{:,.3f}".format)
scores_school_spending["Average Reading Score"] = scores_school_spending["Average Reading Score"].astype(float) \
.map("{:,.3f}".format)
scores_school_spending

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,83.455,83.934,93.46%,96.61%,95.04%
$585-615,83.6,83.885,94.23%,95.90%,95.07%
$615-645,79.079,81.891,75.67%,86.11%,80.89%
$645-675,76.997,81.028,66.16%,81.13%,73.65%


## Scores by School Size 

In [16]:
# Create bins and labels group.
bins = [0, 1000, 2000, 5000]
groups = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

# Add a column based on School Size (Total Students) on df grouped by school.
copy_agg_by_school["School Size"] = pd.cut(copy_agg_by_school["Total Students"], bins, labels=groups)

# Create new df wih aggregation to compute numbers by School Size
aggregation = {'Average Math Score':'mean', 'Average Reading Score':"mean", "% Passing Math":"mean", \
               "% Passing Reading":"mean",  "% Overall Passing Rate":"mean" }
scores_school_size = copy_agg_by_school.groupby("School Size").agg(aggregation) 

# Format to presentation
scores_school_size["% Passing Math"] = scores_school_size["% Passing Math"].astype(float).map("{:.2f}%".format)
scores_school_size["% Passing Reading"] = scores_school_size["% Passing Reading"].astype(float).map("{:.2f}%".format)
scores_school_size["% Overall Passing Rate"] = scores_school_size["% Overall Passing Rate"].astype(float) \
.map("{:.2f}%".format)
scores_school_size["Average Math Score"] = scores_school_size["Average Math Score"].astype(float).map("{:,.3f}".format)
scores_school_size["Average Reading Score"] = scores_school_size["Average Reading Score"].astype(float).map("{:,.3f}".format)
scores_school_size

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.822,83.93,93.55%,96.10%,94.82%
Medium (1000-2000),83.375,83.864,93.60%,96.79%,95.20%
Large (2000-5000),77.746,81.344,69.96%,82.77%,76.36%


## Scores by School Type

In [17]:
# Create new df wih aggregation to compute numbers by School Type
aggregation = {'Average Math Score':'mean', 'Average Reading Score':"mean", "% Passing Math":"mean", \
               "% Passing Reading":"mean",  "% Overall Passing Rate":"mean" }
scores_school_type = copy_agg_by_school.groupby("School Type").agg(aggregation) 

# Format to presentation
scores_school_type["% Passing Math"] = scores_school_type["% Passing Math"].astype(float).map("{:.2f}%".format)
scores_school_type["% Passing Reading"] = scores_school_type["% Passing Reading"].astype(float).map("{:.2f}%".format)
scores_school_type["% Overall Passing Rate"] = scores_school_type["% Overall Passing Rate"].astype(float).map("{:.2f}%".format)
scores_school_type["Average Math Score"] = scores_school_type["Average Math Score"].astype(float).map("{:,.3f}".format)
scores_school_type["Average Reading Score"] = scores_school_type["Average Reading Score"].astype(float).map("{:,.3f}".format)

scores_school_type

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.474,83.896,93.62%,96.59%,95.10%
District,76.957,80.967,66.55%,80.80%,73.67%
