# Pandas Challenge - PyCity Schools

Martin Singla - Data Analyst 

Data Analytics Bootcamp - Tecnológico de Monterrey / Trinity Education

In this Jupyter notebook we use Python's pandas library to analyze education trends at the school-district level.

### Loading & data wrangling 

In [32]:
#Load dependencies and data
import pandas as pd

# Paths are relative to the notebook's working directory; both files are read as UTF-8.
schools = pd.read_csv(
    "Resources/schools_complete.csv",
    encoding="utf-8",
)
students = pd.read_csv(
    "Resources/students_complete.csv",
    encoding="utf-8",
)

In [33]:
#Checking data frames: Schools
schools.count() #non-null count per column (check for NaN); result is not rendered, only the cell's last expression is shown
schools.head() #first 5 of the 15 schools, each with its name, type, size and budget

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [34]:
#Checking data frames: Students
students.count() #non-null count per column (check for NaN); result is not rendered, only the cell's last expression is shown
students.head() #first 5 of the 39,170 students, each with its name, gender, grade, school, reading and math scores.

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [36]:
#Merging datasets
# Left join keeps every student row and attaches the matching school's
# attributes; the school-level columns are then given descriptive names.
school_column_names = {
    "size": "Tot.Students.in.School",
    "budget": "Tot.School.Budget",
}
data = (
    students
    .merge(schools, on="school_name", how="left")
    .rename(columns=school_column_names)
)
data.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,Tot.Students.in.School,Tot.School.Budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


### District Level Summary Statistics

In [37]:
#District Education statistics summary
# A score of 70 or above counts as passing.
passed_math = students["math_score"] >= 70
passed_reading = students["reading_score"] >= 70

# Each rate divides the number of passers by the non-null count of the column
# being measured (the reading rate previously used the math-score count).
pct_passing_math = (
    students.loc[passed_math, "math_score"].count()
    / students["math_score"].count() * 100
)
pct_passing_reading = (
    students.loc[passed_reading, "reading_score"].count()
    / students["reading_score"].count() * 100
)
pct_passing_overall = (
    students.loc[passed_reading & passed_math, "Student ID"].count()
    / students["Student ID"].count() * 100
)

# Single-row frame: every value is wrapped in a one-element list.
df1 = pd.DataFrame({
    "Total Schools": [schools["school_name"].count()],
    "Total Students": [students["student_name"].count()],
    "Total Budget": [f'${schools["budget"].sum()}'],
    "Avg.Math Score": [round(students["math_score"].mean(), 2)],
    "Avg.Reading Score": [round(students["reading_score"].mean(), 2)],
    "% Passing Math": [f'% {round(pct_passing_math, 2)}'],
    "% Passing Reading": [f'% {round(pct_passing_reading, 2)}'],
    "% Overall Passing": [f'% {round(pct_passing_overall, 2)}']
})

df1

Unnamed: 0,Total Schools,Total Students,Total Budget,Avg.Math Score,Avg.Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,$24649428,78.99,81.88,% 74.98,% 85.81,% 65.17


__Top-level summary statistics indicate that students are performing better in reading than in math. From my own experience back in high school, I would consider this trend accurate!__

### School Level Summary Statistics

In [78]:
#Summary statistics at a school level
data_group = data.groupby("school_name")

# School-level attributes are repeated on every student row after the merge,
# so the first value in each group is that school's value.
school_size = data_group["Tot.Students.in.School"].first()
school_budget = data_group["Tot.School.Budget"].first()

# A score of 70 or above counts as passing.
passed_math = data["math_score"] >= 70
passed_reading = data["reading_score"] >= 70

# Summing the boolean masks per school (rather than filtering rows before
# grouping) keeps a school with zero passing students in the result with a
# count of 0, so its percentage is 0.0 instead of NaN.
school_key = data["school_name"]
math_passers = passed_math.groupby(school_key).sum()
reading_passers = passed_reading.groupby(school_key).sum()
overall_passers = (passed_math & passed_reading).groupby(school_key).sum()

# Percentages use the school's reported size as the denominator.
df2 = pd.DataFrame({
    "School Type": data_group["type"].first(),
    "Total Students": school_size,
    "Total School Budget $": school_budget,
    "Per Student Budget $": school_budget / school_size,
    "Average Math Score": round(data_group["math_score"].mean(), 2),
    "Average Reading Score": round(data_group["reading_score"].mean(), 2),
    "% Passing Math": round(math_passers / school_size * 100, 2),
    "% Passing Reading": round(reading_passers / school_size * 100, 2),
    "% Overall Passing": round(overall_passers / school_size * 100, 2)
})
df2

Unnamed: 0_level_0,School Type,Total Students,Total School Budget $,Per Student Budget $,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,3124928,628.0,77.05,81.03,66.68,81.93,54.64
Cabrera High School,Charter,1858,1081356,582.0,83.06,83.98,94.13,97.04,91.33
Figueroa High School,District,2949,1884411,639.0,76.71,81.16,65.99,80.74,53.2
Ford High School,District,2739,1763916,644.0,77.1,80.75,68.31,79.3,54.29
Griffin High School,Charter,1468,917500,625.0,83.35,83.82,93.39,97.14,90.6
Hernandez High School,District,4635,3022020,652.0,77.29,80.93,66.75,80.86,53.53
Holden High School,Charter,427,248087,581.0,83.8,83.81,92.51,96.25,89.23
Huang High School,District,2917,1910635,655.0,76.63,81.18,65.68,81.32,53.51
Johnson High School,District,4761,3094650,650.0,77.07,80.97,66.06,81.22,53.54
Pena High School,Charter,962,585858,609.0,83.84,84.04,94.59,95.95,90.54


__There is a clear gap between schools with a very high share of students passing overall (above 90%) and schools with very poor performance (below 55%). In fact, few schools have an overall passing rate between 55% and 90%. School performance in the district is highly polarized! Correlation statistics and a regression analysis using per-student budget data might help explain this trend.__

### Top & Worst Performing Schools

In [85]:
#Top 5 Performing Schools in terms of "% Overall passing students"
# Order schools from highest to lowest overall passing rate, then keep the first five rows.
ranked_best_first = df2.sort_values(by="% Overall Passing", ascending=False)
ranked_best_first.iloc[:5]

Unnamed: 0_level_0,School Type,Total Students,Total School Budget $,Per Student Budget $,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,1081356,582.0,83.06,83.98,94.13,97.04,91.33
Thomas High School,Charter,1635,1043130,638.0,83.42,83.85,93.27,97.31,90.95
Griffin High School,Charter,1468,917500,625.0,83.35,83.82,93.39,97.14,90.6
Wilson High School,Charter,2283,1319574,578.0,83.27,83.99,93.87,96.54,90.58
Pena High School,Charter,962,585858,609.0,83.84,84.04,94.59,95.95,90.54


In [87]:
#Worst 5 Performing Schools in terms of "% Overall passing students"
# Order schools from lowest to highest overall passing rate, then keep the first five rows.
ranked_worst_first = df2.sort_values(by="% Overall Passing", ascending=True)
ranked_worst_first.iloc[:5]

Unnamed: 0_level_0,School Type,Total Students,Total School Budget $,Per Student Budget $,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,2547363,637.0,76.84,80.74,66.37,80.22,52.99
Figueroa High School,District,2949,1884411,639.0,76.71,81.16,65.99,80.74,53.2
Huang High School,District,2917,1910635,655.0,76.63,81.18,65.68,81.32,53.51
Hernandez High School,District,4635,3022020,652.0,77.29,80.93,66.75,80.86,53.53
Johnson High School,District,4761,3094650,650.0,77.07,80.97,66.06,81.22,53.54


### Average Math & Reading Scores By Grade & School

In [97]:
# Grade labels expected in the data: 9th, 10th, 11th and 12th.
# The result of this check is discarded; only the cell's last expression is rendered.
pd.unique(data["grade"])

#Average Math Scores by grade and school
# One column per grade: restrict to that grade's students, then average the
# math score within each school.
grade_labels = ["9th", "10th", "11th", "12th"]
df3 = pd.DataFrame({
    f"{label} Grade": round(
        data.loc[data["grade"] == label]
        .groupby("school_name")["math_score"]
        .mean(),
        2,
    )
    for label in grade_labels
})
df3

Unnamed: 0_level_0,9th Grade,10th Grade,11th Grade,12th Grade
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.08,77.0,77.52,76.49
Cabrera High School,83.09,83.15,82.77,83.28
Figueroa High School,76.4,76.54,76.88,77.15
Ford High School,77.36,77.67,76.92,76.18
Griffin High School,82.04,84.23,83.84,83.36
Hernandez High School,77.44,77.34,77.14,77.19
Holden High School,83.79,83.43,85.0,82.86
Huang High School,77.03,75.91,76.45,77.23
Johnson High School,77.19,76.69,77.49,76.86
Pena High School,83.63,83.37,84.33,84.12


In [99]:
#Average Reading Scores by grade and school
# One column per grade: restrict to that grade's students, then average the
# reading score within each school. Note that this rebinds df3.
grade_labels = ["9th", "10th", "11th", "12th"]
df3 = pd.DataFrame({
    f"{label} Grade": round(
        data.loc[data["grade"] == label]
        .groupby("school_name")["reading_score"]
        .mean(),
        2,
    )
    for label in grade_labels
})
df3

Unnamed: 0_level_0,9th Grade,10th Grade,11th Grade,12th Grade
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.3,80.91,80.95,80.91
Cabrera High School,83.68,84.25,83.79,84.29
Figueroa High School,81.2,81.41,80.64,81.38
Ford High School,80.63,81.26,80.4,80.66
Griffin High School,83.37,83.71,84.29,84.01
Hernandez High School,80.87,80.66,81.4,80.86
Holden High School,83.68,83.32,83.82,84.7
Huang High School,81.29,81.51,81.42,80.31
Johnson High School,81.26,80.77,80.62,81.23
Pena High School,83.81,83.61,84.34,84.59
