In [1]:
#Module 4 Challenge
#by: Nathaniel Mizzell

In [2]:
#Import dependencies
import pandas as pd
from pathlib import Path

In [3]:
#Get data
school_df_path = Path("Resources\schools_complete.csv")
student_df_path = Path("Resources\students_complete.csv")

school_df = pd.read_csv(school_df_path)
student_df = pd.read_csv(student_df_path)

#Merge dataframes for analysis
df = pd.merge(student_df, school_df, on='school_name', how='left')

df.columns

Index(['Student ID', 'student_name', 'gender', 'grade', 'school_name',
       'reading_score', 'math_score', 'School ID', 'type', 'size', 'budget'],
      dtype='object')

In [4]:
#Analysis

In [5]:
#District Summary

#Total number of unique schools
unique_school_count = school_df['school_name'].nunique()

#Total Students
total_students = school_df['size'].sum()

#Total Budget
total_budget = school_df['budget'].sum()

#Average math score
avg_math_score = df['math_score'].mean()

#Average reading score
avg_reading_score = df['reading_score'].mean()

#PCT passing math (>= 70)
num_passing_students_math = df['math_score'].loc[df['math_score'] >= 70].count()
pct_passing_math = num_passing_students_math / float(total_students)

#PCT passing reading
num_passing_students_reading = df['reading_score'].loc[df['reading_score'] >= 70].count()
pct_passing_reading = num_passing_students_reading / total_students

#PCT overall passing
num_passing_students_overall = df['reading_score'].loc[(df['reading_score'] >= 70)
                                                        & (df['math_score'] >= 70)].count()
pct_passing_overall = num_passing_students_overall / total_students


#return results in a pandas series
results = pd.Series({
    'Total number of unique schools': unique_school_count
    ,'Total Students': total_students
    ,'Total Budget': total_budget
    ,'Average math score': avg_math_score
    ,'Average reading score': avg_reading_score
    ,'PCT passing math (>= 70)': pct_passing_math * 100
    ,'PCT passing reading': pct_passing_reading * 100
    ,'PCT overall passing': pct_passing_overall * 100
}).map("{:,.2f}".format)

results

Total number of unique schools            15.00
Total Students                        39,170.00
Total Budget                      24,649,428.00
Average math score                        78.99
Average reading score                     81.88
PCT passing math (>= 70)                  74.98
PCT passing reading                       85.81
PCT overall passing                       65.17
dtype: object

In [6]:
#School summary

#create results DF
results_df = pd.DataFrame()

#add a column to the ungrouped df. this will allow a sum aggregation to be used...
#to find the total passing students.
#set default value to 0. since we take the sum later, only passing students get 1
df['Passed Reading'] = 0
df['Passed Math'] = 0
df['Passed Overall'] = 0

#set the value of passed reading to 1 if the student passed
df['Passed Reading'].loc[df['reading_score'] >= 70] = 1
df['Passed Math'].loc[df['math_score'] >= 70] = 1
df['Passed Overall'].loc[(df['reading_score'] >= 70) & (df['math_score'] >= 70)] = 1

#Strategy: group data by school 
grouped_df = df.groupby(by='school_name')

#School name
results_df['School Name'] = grouped_df['school_name'].first()

#School type
results_df['School Type'] = grouped_df['type'].first()

#Total students
results_df['Total Students'] = grouped_df['student_name'].count()

#Total school budget
results_df['Total Budget'] = grouped_df['budget'].mean()

#Per student budget
results_df['Budget per Student'] = results_df['Total Budget'] / results_df['Total Students']

#Average math score
results_df['Avg Math Score'] = grouped_df['math_score'].mean()

#Average reading score
results_df['Avg Reading Score'] = grouped_df['reading_score'].mean()

#PCT passing math
results_df['Total Passing Math'] = grouped_df['Passed Math'].sum()
results_df['PCT Passing Math'] = results_df['Total Passing Math'] / results_df['Total Students']

#PCT passing reading
results_df['Total Passing Reading'] = grouped_df['Passed Reading'].sum()
results_df['PCT Passing Reading'] = results_df['Total Passing Reading'] / results_df['Total Students']

#PCT overall passing
results_df['Total Passing Overall'] = grouped_df['Passed Overall'].sum()
results_df['PCT Passing Overall'] = results_df['Total Passing Overall'] / results_df['Total Students']

results_df = results_df.drop(['Total Passing Overall', 'Total Passing Reading', 'Total Passing Math'], axis=1)

results_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0_level_0,School Name,School Type,Total Students,Total Budget,Budget per Student,Avg Math Score,Avg Reading Score,PCT Passing Math,PCT Passing Reading,PCT Passing Overall
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bailey High School,Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,0.666801,0.819333,0.546423
Cabrera High School,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,0.941335,0.970398,0.913348
Figueroa High School,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,0.659885,0.807392,0.532045
Ford High School,Ford High School,District,2739,1763916,644.0,77.102592,80.746258,0.683096,0.79299,0.542899
Griffin High School,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,0.933924,0.97139,0.905995
Hernandez High School,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,0.66753,0.80863,0.535275
Holden High School,Holden High School,Charter,427,248087,581.0,83.803279,83.814988,0.925059,0.962529,0.892272
Huang High School,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,0.656839,0.813164,0.535139
Johnson High School,Johnson High School,District,4761,3094650,650.0,77.072464,80.966394,0.660576,0.812224,0.535392
Pena High School,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,0.945946,0.959459,0.905405


In [16]:
#Highest performing Schools by PCT overall passing
top_schools = results_df.sort_values(by='PCT Passing Overall', ascending=False).iloc[0:5,:]
top_schools

Unnamed: 0_level_0,School Name,School Type,Total Students,Total Budget,Budget per Student,Avg Math Score,Avg Reading Score,PCT Passing Math,PCT Passing Reading,PCT Passing Overall
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Cabrera High School,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,0.941335,0.970398,0.913348
Thomas High School,Thomas High School,Charter,1635,1043130,638.0,83.418349,83.84893,0.932722,0.973089,0.90948
Griffin High School,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,0.933924,0.97139,0.905995
Wilson High School,Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,0.938677,0.965396,0.905826
Pena High School,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,0.945946,0.959459,0.905405


In [18]:
#Lowest performing schools by PCT overall passing
bottom_schools = results_df.sort_values(by='PCT Passing Overall', ascending=True).iloc[0:5,:]
bottom_schools

Unnamed: 0_level_0,School Name,School Type,Total Students,Total Budget,Budget per Student,Avg Math Score,Avg Reading Score,PCT Passing Math,PCT Passing Reading,PCT Passing Overall
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Rodriguez High School,Rodriguez High School,District,3999,2547363,637.0,76.842711,80.744686,0.663666,0.802201,0.529882
Figueroa High School,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,0.659885,0.807392,0.532045
Huang High School,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,0.656839,0.813164,0.535139
Hernandez High School,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,0.66753,0.80863,0.535275
Johnson High School,Johnson High School,District,4761,3094650,650.0,77.072464,80.966394,0.660576,0.812224,0.535392


In [21]:
#Average math Score by Grade
grouped_df = df.groupby(by='grade')

avg_math_score_by_grade = pd.Series(grouped_df['math_score'].mean())
avg_math_score_by_grade

grade
10th    78.941483
11th    79.083548
12th    78.993164
9th     78.935659
Name: math_score, dtype: float64

In [22]:
#Reading scores by grade
grouped_df = df.groupby(by='grade')

avg_reading_score_by_grade = pd.Series(grouped_df['reading_score'].mean())
avg_reading_score_by_grade

grade
10th    81.874410
11th    81.885714
12th    81.819851
9th     81.914358
Name: reading_score, dtype: float64

In [11]:
#Scores by school spending


In [12]:
#Scores by school type