PyCity School Analysis
February 8, 2019
Scott McEachern

In [1]:
#- Load Datasets into DataFrame
import os
import pandas as pd


# Schools DataFrame
schoolPath = os.path.join(".", "Resources", "schools_complete.csv")

school_df = pd.read_csv(schoolPath)


# Student DataFrame
studentPath = os.path.join(".", "Resources", "students_complete.csv")

student_df = pd.read_csv(studentPath)

In [2]:
#-- District Summary

#- Calculate Total Schools
totalNumSchools = school_df.shape[0]


#- Calculate Total Students
totalNumStudents = student_df.shape[0]
totalNumStudentsFormat = f"{'{:,}'.format(totalNumStudents)}"


#- Total School Budget
totalSchoolBudget = school_df["budget"].sum()
totalSchoolBudgetFormat = f"${'{:,.2f}'.format(totalSchoolBudget)}"


#- Average Math Score
aveMathScore = student_df["math_score"].mean()
aveMathScoreFormat = f"{'{:,.6f}'.format(aveMathScore)}"


#- Average Reading Score
aveReadingScore = student_df["reading_score"].mean()
aveReadingScoreFormat = f"{'{:,.6f}'.format(aveReadingScore)}"


#- Overall Passing Rate
overallAveScore = (aveMathScore + aveReadingScore) / 2
overallAveScoreFormat = f"{'{:,.6f}'.format(overallAveScore)}"


#- Percentage students with passing math score 
totalStudentsPassingMath = student_df[student_df['math_score'] >= 70].shape[0]
percentStudentsPassingMath = (totalStudentsPassingMath / totalNumStudents) * 100

percentStudentsPassingMathFormat = f"{'{:.6f}'.format(percentStudentsPassingMath)}"


#- Percentage students with passing reading score
totalStudentsPassingReading = student_df[student_df['reading_score'] >= 70].shape[0]
percentStudentsPassingReading = (totalStudentsPassingReading / totalNumStudents) * 100

percentStudentsPassingReadingFormat = f"{'{:.6f}'.format(percentStudentsPassingReading)}"


# Result Dataframe
districtSummaryResults = {
    'Total Schools': [totalNumSchools],
    'Total Students': [totalNumStudentsFormat],
    'Total Budget': [totalSchoolBudgetFormat],
    'Average Match Score': [aveMathScoreFormat],
    'Average Reading Score': [aveReadingScoreFormat],
    '% Passing Math': [percentStudentsPassingMathFormat],
    '% Passing Reading': [percentStudentsPassingReadingFormat],
    '% Overall Passing Rate': [overallAveScoreFormat]
                }

districtSummary_df = pd.DataFrame(districtSummaryResults)

#print(percentStudentsPassingMatchFormat)

districtSummary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Match Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,80.431606


In [6]:
#-- School Summary
# Summarize the student data by the school; creates a DataFrame to store the information.

#- Merge into single dataframe
studentWithSchool_df = pd.merge(student_df, school_df, how="left", on=["school_name", "school_name"])


#- Group by the school name
studentGroupBySchool_dfGroupBy = studentWithSchool_df.groupby('school_name')


#- Create Dictionary to store summary information
schoolSummaryData = {
    'School Name': [],
    'School Type': [],
    'Total Students': [],
    'Total School Budget': [],
    'Per Student Budget': [],
    'Average Math Score': [],
    'Average Reading Score': [],
    '% Passing Math': [],
    '% Passing Reading': [],
    '% Overall Passing Rate': []
    }


#- Calculate Summary for each school
#  Loop through the groupby and calculate the summary for each school
for schoolName, groupedSchool_df in studentGroupBySchool_dfGroupBy:
    
    # Set School Name
    schoolSummaryData['School Name'].append(schoolName)
    
    
    # Set School Type
    schoolSummaryData['School Type'].append(groupedSchool_df.iloc[0]['type'] )
    
    
    # Calculate Total Students
    schoolStudentTotal = groupedSchool_df.shape[0]
    schoolSummaryData['Total Students'].append(schoolStudentTotal)

    
    # Total School Budget
    schoolBudget = groupedSchool_df.iloc[0]['budget']
    schoolSummaryData['Total School Budget'].append(schoolBudget)
#TODO - format that school budget before adding to data frame


    # Average Student Budget
    perStudentBudget = (schoolBudget/schoolStudentTotal)
    schoolSummaryData['Per Student Budget'].append(perStudentBudget)
    
    
    # Average Math Score
    schoolAveMathScore = groupedSchool_df['math_score'].mean()
    schoolSummaryData['Average Math Score'].append(schoolAveMathScore)
    
    
    # Average Reading Score
    schoolAveReadingScore = groupedSchool_df['reading_score'].mean()
    schoolSummaryData['Average Reading Score'].append(schoolAveReadingScore)                                                            

    
    # Percent Passing Math
    schoolStudentsPassingMath = groupedSchool_df[groupedSchool_df['math_score'] >= 70].shape[0]
    schoolPercentPassingMatch = (schoolStudentsPassingMath / schoolStudentTotal) * 100
    
    schoolSummaryData['% Passing Math'].append(schoolPercentPassingMatch)
    
    
    # Percent Passing Reading
    schoolStudentsPassingReading = groupedSchool_df[groupedSchool_df['reading_score'] >= 70].shape[0]
    schoolPercentPassingReading = (schoolStudentsPassingReading / schoolStudentTotal) * 100
    
    schoolSummaryData['% Passing Reading'].append(schoolPercentPassingReading)
    
    
    # Overall Passing Rate
    schoolOverallPassingRate = (schoolPercentPassingMatch + schoolPercentPassingReading) / 2
    
    schoolSummaryData['% Overall Passing Rate'].append(schoolOverallPassingRate)
    
    
#- Create DataFrame of results
schoolSummary_df = pd.DataFrame(schoolSummaryData)



In [8]:
#-- Top Performing Schools

#- Sort by Passing Rate
topSchoolSummary_df = schoolSummary_df.sort_values(by=['% Overall Passing Rate'], ascending=False)


#- Create Dataframe contains top 5
#  Dateframe created rather than using Head functionality
top5SchoolSummary_df = topSchoolSummary_df[:5]


#- Display Top Schools
top5SchoolSummary_df.head()

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
1,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,95.586652
12,Thomas High School,Charter,1635,1043130,638.0,83.418349,83.84893,93.272171,97.308869,95.29052
9,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,95.27027
4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,95.265668
13,Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641,95.203679


In [9]:
#-- Bottom Performing Schools

#- Sort by Passing Rate
bottomSchoolSummary_df = schoolSummary_df.sort_values(by=['% Overall Passing Rate'], ascending=True)


#- Create Dataframe with bottom 5
bottom5SchoolSummary_df = bottomSchoolSummary_df[:5]


#- Display Bottom Schools
bottom5SchoolSummary_df.head()

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
10,Rodriguez High School,District,3999,2547363,637.0,76.842711,80.744686,66.366592,80.220055,73.293323
2,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,73.363852
7,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171
8,Johnson High School,District,4761,3094650,650.0,77.072464,80.966394,66.057551,81.222432,73.639992
3,Ford High School,District,2739,1763916,644.0,77.102592,80.746258,68.309602,79.299014,73.804308
