In [2]:
# Import pandas
import pandas as pd

In [3]:
# Save path to and set variables for data sets to be used
schools_data = "Resources/schools_complete.csv"

students_data = "Resources/students_complete.csv"

schools_data

'Resources/schools_complete.csv'

In [50]:
# Use pandas to read data, define individual data frames and merge to create combined dataframe
schools_data_df = pd.read_csv(schools_data)

students_data_df = pd.read_csv(students_data)

combined_data_df = pd.merge(schools_data_df, students_data_df, how="left", on=["school_name","school_name"])

## Local Government Area Summary

In [5]:
# Calculate total number of unqiue schools
unq_schools = schools_data_df["school_name"].unique()
unq_schools_count = len(schools_data_df)
unq_schools


array(['Huang High School', 'Figueroa High School', 'Shelton High School',
       'Hernandez High School', 'Griffin High School',
       'Wilson High School', 'Cabrera High School', 'Bailey High School',
       'Holden High School', 'Pena High School', 'Wright High School',
       'Rodriguez High School', 'Johnson High School', 'Ford High School',
       'Thomas High School'], dtype=object)

In [6]:
# Calculate total number of students
student_count = schools_data_df["size"].sum()
student_count

39170

In [7]:
# Calculate total school budget for local government area
total_budget = schools_data_df["budget"].sum()
total_budget

24649428

In [8]:
# Calcualte average maths score
mathsscores_df = students_data_df["maths_score"]

avg_mathsscore = mathsscores_df.mean()

avg_mathsscore


70.33819249425581

In [9]:
# Calculate average reading score
readingscores_df = students_data_df["reading_score"]

avg_readingscore = readingscores_df.mean()

avg_readingscore

69.98013786060761

In [10]:
# Calculate % of students who passed maths

# New df that includes only those that passed maths
pass_maths_df = students_data_df.loc[students_data_df["maths_score"]>=50]

# Count of maths passers over total student count
pct_pass_maths = ((pass_maths_df["maths_score"].count())/student_count)*100

pct_pass_maths


86.07863160582077

In [11]:
# Calculate % of students who passed reading

# New df that includes only those that passed reading
pass_reading_df = students_data_df.loc[students_data_df["reading_score"]>=50]

# Count of reading passers over total student count
pct_pass_reading = ((pass_reading_df["reading_score"].count())/student_count)*100

pct_pass_reading

84.42685728874139

In [12]:
# Calculate % of students who passed both maths and reading

# New df that includes only those that passed both maths and reading
pass_both_df = pass_maths_df.loc[pass_maths_df["reading_score"]>=50]

# Count of maths and reading passers over total student count
pct_pass_both = ((pass_both_df["reading_score"].count())/student_count)*100

pct_pass_both

72.80827163645647

In [13]:
# Create LGA Summary dataframe
lga_summary_df = pd.DataFrame({
    "Unique Schools": [unq_schools_count],
    "Total Students": ["{:,}".format(student_count)],
    "Total Budget": ["${:,}".format(total_budget)],
    "Average Maths Score": ['{:.3f}%'.format(avg_mathsscore)],
    "Average Reading Score": ['{:.3f}%'.format(avg_readingscore)],
    "% Passing Maths":['{:.3f}%'.format(pct_pass_maths)],
    "% Passing Reading": ['{:.3f}%'.format(pct_pass_reading)],
    "% Overall Passing": ['{:.3f}%'.format(pct_pass_both)]
})

lga_summary_df

Unnamed: 0,Unique Schools,Total Students,Total Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428",70.338%,69.980%,86.079%,84.427%,72.808%


## School Summary

In [14]:
# Drop columns not being used from Combined df to create new reduced df
combined_data_reduced_df = combined_data_df.drop('School ID', axis=1)
combined_data_reduced_df = combined_data_reduced_df.drop('Student ID', axis=1)
combined_data_reduced_df = combined_data_reduced_df.drop('student_name', axis=1)
combined_data_reduced_df = combined_data_reduced_df.drop('gender', axis=1)
combined_data_reduced_df = combined_data_reduced_df.drop('year', axis=1)
combined_data_reduced_df = combined_data_reduced_df.drop('reading_score', axis=1)
combined_data_reduced_df = combined_data_reduced_df.drop('maths_score', axis=1)

In [51]:
# Group df by School Name, School Type, Total Students and Total School Budget
school_grouped = combined_data_reduced_df.groupby(["school_name", "type", "size", "budget"])

# Delete duplicates to create new summary df that is the same as the grouped df output
school_summary_df = combined_data_reduced_df.drop_duplicates()

# Sort df alphabetically by school
school_summary_df = school_summary_df.sort_values('school_name')

# Rename columns
school_summary_df = school_summary_df.rename(columns={"school_name":"School Name", "type":"School Type", "size":"Total Students", "budget":"Total School Budget"})

school_summary_df



Unnamed: 0,School Name,School Type,Total Students,Total School Budget
17871,Bailey High School,Government,4976,3124928
16013,Cabrera High School,Independent,1858,1081356
2917,Figueroa High School,Government,2949,1884411
34796,Ford High School,Government,2739,1763916
12262,Griffin High School,Independent,1468,917500
7627,Hernandez High School,Government,4635,3022020
22847,Holden High School,Independent,427,248087
0,Huang High School,Government,2917,1910635
30035,Johnson High School,Government,4761,3094650
23274,Pena High School,Independent,962,585858


In [17]:
# Create per student budget column
school_summary_df["Per Student Budget"] = school_summary_df["Total School Budget"]/school_summary_df["Total Students"]

school_summary_df

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget
17871,Bailey High School,Government,4976,3124928,628.0
16013,Cabrera High School,Independent,1858,1081356,582.0
2917,Figueroa High School,Government,2949,1884411,639.0
34796,Ford High School,Government,2739,1763916,644.0
12262,Griffin High School,Independent,1468,917500,625.0
7627,Hernandez High School,Government,4635,3022020,652.0
22847,Holden High School,Independent,427,248087,581.0
0,Huang High School,Government,2917,1910635,655.0
30035,Johnson High School,Government,4761,3094650,650.0
23274,Pena High School,Independent,962,585858,609.0


In [18]:
# Create df for each individual high school
bailey = students_data_df.loc[(students_data_df["school_name"] == "Bailey High School")]
cabrera = students_data_df.loc[(students_data_df["school_name"] == "Cabrera High School")]
figueroa = students_data_df.loc[(students_data_df["school_name"] == "Figueroa High School")]
ford = students_data_df.loc[(students_data_df["school_name"] == "Ford High School")]
griffin = students_data_df.loc[(students_data_df["school_name"] == "Griffin High School")]
hernandez = students_data_df.loc[(students_data_df["school_name"] == "Hernandez High School")]
holden = students_data_df.loc[(students_data_df["school_name"] == "Holden High School")]
huang = students_data_df.loc[(students_data_df["school_name"] == "Huang High School")]
johnson = students_data_df.loc[(students_data_df["school_name"] == "Johnson High School")]
pena = students_data_df.loc[(students_data_df["school_name"] == "Pena High School")]
rodriguez = students_data_df.loc[(students_data_df["school_name"] == "Rodriguez High School")]
shelton = students_data_df.loc[(students_data_df["school_name"] == "Shelton High School")]
thomas = students_data_df.loc[(students_data_df["school_name"] == "Thomas High School")]
wilson = students_data_df.loc[(students_data_df["school_name"] == "Wilson High School")]
wright = students_data_df.loc[(students_data_df["school_name"] == "Wright High School")]

In [19]:
#Create df of avg maths scores and avg reading scores
avg_scores_df = pd.DataFrame({
"School Name": ['Bailey High School', 'Cabrera High School', 'Figueroa High School', 'Ford High School', 'Griffin High School',
'Hernandez High School', 'Holden High School', 'Huang High School', 'Johnson High School', 'Pena High School', 
'Rodriguez High School', 'Shelton High School', 'Thomas High School', 'Wilson High School', 'Wright High School'],
    "Average Maths Score": [bailey["maths_score"].mean(), cabrera["maths_score"].mean(), figueroa["maths_score"].mean(), 
ford["maths_score"].mean(),griffin["maths_score"].mean(),hernandez["maths_score"].mean(),holden["maths_score"].mean(), 
huang["maths_score"].mean(),johnson["maths_score"].mean(),pena["maths_score"].mean(),rodriguez["maths_score"].mean(), 
shelton["maths_score"].mean(),thomas["maths_score"].mean(),wilson["maths_score"].mean(), wright["maths_score"].mean()],
    "Average Reading Score": [bailey["reading_score"].mean(), cabrera["reading_score"].mean(), figueroa["reading_score"].mean(), 
ford["reading_score"].mean(),griffin["reading_score"].mean(),hernandez["reading_score"].mean(),holden["reading_score"].mean(), 
huang["reading_score"].mean(),johnson["reading_score"].mean(),pena["reading_score"].mean(),rodriguez["reading_score"].mean(), 
shelton["reading_score"].mean(),thomas["reading_score"].mean(),wilson["reading_score"].mean(), wright["reading_score"].mean()]
})

avg_scores_df


Unnamed: 0,School Name,Average Maths Score,Average Reading Score
0,Bailey High School,72.352894,71.008842
1,Cabrera High School,71.657158,71.359526
2,Figueroa High School,68.698542,69.077993
3,Ford High School,69.091274,69.572472
4,Griffin High School,71.788147,71.245232
5,Hernandez High School,68.874865,69.186408
6,Holden High School,72.583138,71.660422
7,Huang High School,68.935207,68.910525
8,Johnson High School,68.8431,69.039277
9,Pena High School,72.088358,71.613306


In [20]:
# Merge avg scores df to school summary df
school_summary_df = pd.merge(school_summary_df,avg_scores_df, on="School Name", how="outer")

school_summary_df

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score
0,Bailey High School,Government,4976,3124928,628.0,72.352894,71.008842
1,Cabrera High School,Independent,1858,1081356,582.0,71.657158,71.359526
2,Figueroa High School,Government,2949,1884411,639.0,68.698542,69.077993
3,Ford High School,Government,2739,1763916,644.0,69.091274,69.572472
4,Griffin High School,Independent,1468,917500,625.0,71.788147,71.245232
5,Hernandez High School,Government,4635,3022020,652.0,68.874865,69.186408
6,Holden High School,Independent,427,248087,581.0,72.583138,71.660422
7,Huang High School,Government,2917,1910635,655.0,68.935207,68.910525
8,Johnson High School,Government,4761,3094650,650.0,68.8431,69.039277
9,Pena High School,Independent,962,585858,609.0,72.088358,71.613306


In [21]:
# Create dfs of passing maths students for each school
baileypassmaths = bailey.loc[bailey["maths_score"]>=50]
cabrerapassmaths = cabrera.loc[cabrera["maths_score"]>=50]
figueroapassmaths = figueroa.loc[figueroa["maths_score"]>=50]
fordpassmaths = ford.loc[ford["maths_score"]>=50]
griffinpassmaths = griffin.loc[griffin["maths_score"]>=50]
hernandezpassmaths = hernandez.loc[hernandez["maths_score"]>=50]
holdenpassmaths = holden.loc[holden["maths_score"]>=50]
huangpassmaths = huang.loc[huang["maths_score"]>=50]
johnsonpassmaths = johnson.loc[johnson["maths_score"]>=50]
penapassmaths = pena.loc[pena["maths_score"]>=50]
rodriguezpassmaths = rodriguez.loc[rodriguez["maths_score"]>=50]
sheltonpassmaths = shelton.loc[shelton["maths_score"]>=50]
thomaspassmaths = thomas.loc[thomas["maths_score"]>=50]
wilsonpassmaths = wilson.loc[wilson["maths_score"]>=50]
wrightpassmaths = wright.loc[wright["maths_score"]>=50]

# Create dfs of passing reading students for each school
baileypassread = bailey.loc[bailey["reading_score"]>=50]
cabrerapassread = cabrera.loc[cabrera["reading_score"]>=50]
figueroapassread = figueroa.loc[figueroa["reading_score"]>=50]
fordpassread = ford.loc[ford["reading_score"]>=50]
griffinpassread = griffin.loc[griffin["reading_score"]>=50]
hernandezpassread = hernandez.loc[hernandez["reading_score"]>=50]
holdenpassread = holden.loc[holden["reading_score"]>=50]
huangpassread = huang.loc[huang["reading_score"]>=50]
johnsonpassread = johnson.loc[johnson["reading_score"]>=50]
penapassread = pena.loc[pena["reading_score"]>=50]
rodriguezpassread = rodriguez.loc[rodriguez["reading_score"]>=50]
sheltonpassread = shelton.loc[shelton["reading_score"]>=50]
thomaspassread = thomas.loc[thomas["reading_score"]>=50]
wilsonpassread = wilson.loc[wilson["reading_score"]>=50]
wrightpassread = wright.loc[wright["reading_score"]>=50]

# Create dfs of passing maths and reading students for each school
baileypassboth = baileypassmaths.loc[baileypassmaths["reading_score"]>=50]
cabrerapassboth = cabrerapassmaths.loc[cabrerapassmaths["reading_score"]>=50]
figueroapassboth = figueroapassmaths.loc[figueroapassmaths["reading_score"]>=50]
fordpassboth = fordpassmaths.loc[fordpassmaths["reading_score"]>=50]
griffinpassboth = griffinpassmaths.loc[griffinpassmaths["reading_score"]>=50]
hernandezpassboth = hernandezpassmaths.loc[hernandezpassmaths["reading_score"]>=50]
holdenpassboth = holdenpassmaths.loc[holdenpassmaths["reading_score"]>=50]
huangpassboth = huangpassmaths.loc[huangpassmaths["reading_score"]>=50]
johnsonpassboth = johnsonpassmaths.loc[johnsonpassmaths["reading_score"]>=50]
penapassboth = penapassmaths.loc[penapassmaths["reading_score"]>=50]
rodriguezpassboth = rodriguezpassmaths.loc[rodriguezpassmaths["reading_score"]>=50]
sheltonpassboth = sheltonpassmaths.loc[sheltonpassmaths["reading_score"]>=50]
thomaspassboth = thomaspassmaths.loc[thomaspassmaths["reading_score"]>=50]
wilsonpassboth = wilsonpassmaths.loc[wilsonpassmaths["reading_score"]>=50]
wrightpassboth = wrightpassmaths.loc[wrightpassmaths["reading_score"]>=50]

# New df of % of students who passed maths, reading and both
pct_passed_df = pd.DataFrame({
    "School Name": ['Bailey High School', 'Cabrera High School', 'Figueroa High School', 'Ford High School', 'Griffin High School',
    'Hernandez High School', 'Holden High School', 'Huang High School', 'Johnson High School', 'Pena High School', 
    'Rodriguez High School', 'Shelton High School', 'Thomas High School', 'Wilson High School', 'Wright High School'],
    "% Passing Maths": [(len(baileypassmaths)/len(bailey))*100,(len(cabrerapassmaths)/len(cabrera))*100, (len(figueroapassmaths)/len(figueroa))*100, (len(fordpassmaths)/len(ford))*100, 
    (len(griffinpassmaths)/len(griffin))*100, (len(hernandezpassmaths)/len(hernandez))*100, (len(holdenpassmaths)/len(holden))*100, (len(huangpassmaths)/len(huang))*100, 
    (len(johnsonpassmaths)/len(johnson))*100, (len(penapassmaths)/len(pena))*100, (len(rodriguezpassmaths)/len(rodriguez))*100, (len(sheltonpassmaths)/len(shelton))*100, 
    (len(thomaspassmaths)/len(thomas))*100, (len(wilsonpassmaths)/len(wilson))*100, (len(wrightpassmaths)/len(wright))*100],
    "% Passing Reading":[(len(baileypassread)/len(bailey))*100,(len(cabrerapassread)/len(cabrera))*100, (len(figueroapassread)/len(figueroa))*100, (len(fordpassread)/len(ford))*100, 
    (len(griffinpassread)/len(griffin))*100, (len(hernandezpassread)/len(hernandez))*100, (len(holdenpassread)/len(holden))*100, (len(huangpassread)/len(huang))*100, 
    (len(johnsonpassread)/len(johnson))*100, (len(penapassread)/len(pena))*100, (len(rodriguezpassread)/len(rodriguez))*100, (len(sheltonpassread)/len(shelton))*100, 
    (len(thomaspassread)/len(thomas))*100, (len(wilsonpassread)/len(wilson))*100, (len(wrightpassread)/len(wright))*100],
    "% Overall Passing":[(len(baileypassboth)/len(bailey))*100,(len(cabrerapassboth)/len(cabrera))*100, (len(figueroapassboth)/len(figueroa))*100, (len(fordpassboth)/len(ford))*100, 
    (len(griffinpassboth)/len(griffin))*100, (len(hernandezpassboth)/len(hernandez))*100, (len(holdenpassboth)/len(holden))*100, (len(huangpassboth)/len(huang))*100, 
    (len(johnsonpassboth)/len(johnson))*100, (len(penapassboth)/len(pena))*100, (len(rodriguezpassboth)/len(rodriguez))*100, (len(sheltonpassboth)/len(shelton))*100, 
    (len(thomaspassboth)/len(thomas))*100, (len(wilsonpassboth)/len(wilson))*100, (len(wrightpassboth)/len(wright))*100]
    })

pct_passed_df



Unnamed: 0,School Name,% Passing Maths,% Passing Reading,% Overall Passing
0,Bailey High School,91.639871,87.379421,80.084405
1,Cabrera High School,90.850377,89.074273,80.785791
2,Figueroa High School,81.654798,82.807731,67.650051
3,Ford High School,82.438846,82.219788,67.46988
4,Griffin High School,91.212534,88.487738,81.33515
5,Hernandez High School,80.949299,81.877023,66.364617
6,Holden High School,89.929742,88.52459,78.922717
7,Huang High School,81.693521,81.453548,66.712376
8,Johnson High School,82.062592,81.978576,67.191766
9,Pena High School,91.683992,86.590437,79.209979


In [22]:
# Merge % Passing scores columns to school summary df
school_summary_df = pd.merge(school_summary_df,pct_passed_df, on="School Name", how="outer")

school_summary_df

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
0,Bailey High School,Government,4976,3124928,628.0,72.352894,71.008842,91.639871,87.379421,80.084405
1,Cabrera High School,Independent,1858,1081356,582.0,71.657158,71.359526,90.850377,89.074273,80.785791
2,Figueroa High School,Government,2949,1884411,639.0,68.698542,69.077993,81.654798,82.807731,67.650051
3,Ford High School,Government,2739,1763916,644.0,69.091274,69.572472,82.438846,82.219788,67.46988
4,Griffin High School,Independent,1468,917500,625.0,71.788147,71.245232,91.212534,88.487738,81.33515
5,Hernandez High School,Government,4635,3022020,652.0,68.874865,69.186408,80.949299,81.877023,66.364617
6,Holden High School,Independent,427,248087,581.0,72.583138,71.660422,89.929742,88.52459,78.922717
7,Huang High School,Government,2917,1910635,655.0,68.935207,68.910525,81.693521,81.453548,66.712376
8,Johnson High School,Government,4761,3094650,650.0,68.8431,69.039277,82.062592,81.978576,67.191766
9,Pena High School,Independent,962,585858,609.0,72.088358,71.613306,91.683992,86.590437,79.209979


In [23]:
# Format total school budget column with "$"
school_summary_df["Total School Budget"] = school_summary_df["Total School Budget"].map("${:,}".format)

# Format per student budget column with "$"
school_summary_df["Per Student Budget"] = school_summary_df["Per Student Budget"].map("${:.0f}".format)

school_summary_df

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
0,Bailey High School,Government,4976,"$3,124,928",$628,72.352894,71.008842,91.639871,87.379421,80.084405
1,Cabrera High School,Independent,1858,"$1,081,356",$582,71.657158,71.359526,90.850377,89.074273,80.785791
2,Figueroa High School,Government,2949,"$1,884,411",$639,68.698542,69.077993,81.654798,82.807731,67.650051
3,Ford High School,Government,2739,"$1,763,916",$644,69.091274,69.572472,82.438846,82.219788,67.46988
4,Griffin High School,Independent,1468,"$917,500",$625,71.788147,71.245232,91.212534,88.487738,81.33515
5,Hernandez High School,Government,4635,"$3,022,020",$652,68.874865,69.186408,80.949299,81.877023,66.364617
6,Holden High School,Independent,427,"$248,087",$581,72.583138,71.660422,89.929742,88.52459,78.922717
7,Huang High School,Government,2917,"$1,910,635",$655,68.935207,68.910525,81.693521,81.453548,66.712376
8,Johnson High School,Government,4761,"$3,094,650",$650,68.8431,69.039277,82.062592,81.978576,67.191766
9,Pena High School,Independent,962,"$585,858",$609,72.088358,71.613306,91.683992,86.590437,79.209979


## Top Performing Schools (By % Overall Passing)

In [24]:
# Sort schools by % overall passing to get top performing
top_overall = school_summary_df.sort_values("% Overall Passing", ascending=False)
top_overall.head()

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
4,Griffin High School,Independent,1468,"$917,500",$625,71.788147,71.245232,91.212534,88.487738,81.33515
1,Cabrera High School,Independent,1858,"$1,081,356",$582,71.657158,71.359526,90.850377,89.074273,80.785791
0,Bailey High School,Government,4976,"$3,124,928",$628,72.352894,71.008842,91.639871,87.379421,80.084405
14,Wright High School,Independent,1800,"$1,049,400",$583,72.047222,70.969444,91.777778,86.666667,79.722222
10,Rodriguez High School,Government,3999,"$2,547,363",$637,72.047762,70.935984,90.797699,87.396849,79.419855


## Bottom Performing Schools (By % Overall Passing)

In [25]:
# Sort schools by % overall passing to get bottom performing
bottom_overall = school_summary_df.sort_values("% Overall Passing", ascending=True)
bottom_overall.head()

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
5,Hernandez High School,Government,4635,"$3,022,020",$652,68.874865,69.186408,80.949299,81.877023,66.364617
7,Huang High School,Government,2917,"$1,910,635",$655,68.935207,68.910525,81.693521,81.453548,66.712376
8,Johnson High School,Government,4761,"$3,094,650",$650,68.8431,69.039277,82.062592,81.978576,67.191766
13,Wilson High School,Independent,2283,"$1,319,574",$578,69.170828,68.876916,82.785808,81.29654,67.455103
3,Ford High School,Government,2739,"$1,763,916",$644,69.091274,69.572472,82.438846,82.219788,67.46988


## Maths Scores by Year

In [26]:
# Create df of all year 9, 10, 11, 12 students
yr9_df = combined_data_df.loc[(combined_data_df["year"] == 9),:]
yr10_df = combined_data_df.loc[(combined_data_df["year"] == 10),:]
yr11_df = combined_data_df.loc[(combined_data_df["year"] == 11),:]
yr12_df = combined_data_df.loc[(combined_data_df["year"] == 12),:]

# Group each df by school name
grouped_yr9_df = yr9_df.groupby(["school_name"])
grouped_yr10_df = yr10_df.groupby(["school_name"])
grouped_yr11_df = yr11_df.groupby(["school_name"])
grouped_yr12_df = yr12_df.groupby(["school_name"])

# Calculate average of of maths scores for each school and year in a series
avg_yr9_maths_df = grouped_yr9_df["maths_score"].mean()
avg_yr10_maths_df = grouped_yr10_df["maths_score"].mean()
avg_yr11_maths_df = grouped_yr11_df["maths_score"].mean()
avg_yr12_maths_df = grouped_yr12_df["maths_score"].mean()


In [54]:
# Create new df by merging series representing individual years
mathsscoresbyyear1_df = pd.merge(avg_yr9_maths_df,avg_yr10_maths_df, on="school_name",how="outer")
mathsscoresbyyear2_df = pd.merge(avg_yr11_maths_df,avg_yr12_maths_df, on="school_name",how="outer")
mathsscoresbyyear_df = pd.merge(mathsscoresbyyear1_df,mathsscoresbyyear2_df, on="school_name",how="outer")

mathsscoresbyyear_df.head()


Unnamed: 0_level_0,maths_score_x_x,maths_score_y_x,maths_score_x_y,maths_score_y_y
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,72.493827,71.897498,72.3749,72.675097
Cabrera High School,72.32197,72.437768,71.008299,70.604712
Figueroa High School,68.477804,68.331586,68.811001,69.325282
Ford High School,69.021609,69.387006,69.248862,68.617811
Griffin High School,72.789731,71.093596,71.692521,71.469178


In [55]:
# Rename df columns
mathsscoresbyyear_df = mathsscoresbyyear_df.rename(columns={"maths_score_x_x":"Year 9", "maths_score_y_x": "Year 10", "maths_score_x_y": "Year 11", "maths_score_y_y": "Year 12"})

mathsscoresbyyear_df

Unnamed: 0_level_0,Year 9,Year 10,Year 11,Year 12
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,72.493827,71.897498,72.3749,72.675097
Cabrera High School,72.32197,72.437768,71.008299,70.604712
Figueroa High School,68.477804,68.331586,68.811001,69.325282
Ford High School,69.021609,69.387006,69.248862,68.617811
Griffin High School,72.789731,71.093596,71.692521,71.469178
Hernandez High School,68.586831,68.867156,69.154412,68.985075
Holden High School,70.543307,75.105263,71.640777,73.409639
Huang High School,69.081754,68.533246,69.431345,68.639316
Johnson High School,69.469286,67.99022,68.63773,69.287393
Pena High School,71.996364,72.396,72.523438,71.187845


## Reading Scores by Year

In [29]:
# Calculate average of of reading scores for each school and year in a series
avg_yr9_read_df = grouped_yr9_df["reading_score"].mean()
avg_yr10_read_df = grouped_yr10_df["reading_score"].mean()
avg_yr11_read_df = grouped_yr11_df["reading_score"].mean()
avg_yr12_read_df = grouped_yr12_df["reading_score"].mean()

# Create new df by merging series representing individual years
readscoresbyyear1_df = pd.merge(avg_yr9_read_df,avg_yr10_read_df, on="school_name",how="outer")
readscoresbyyear2_df = pd.merge(avg_yr11_read_df,avg_yr12_read_df, on="school_name",how="outer")
readscoresbyyear_df = pd.merge(readscoresbyyear1_df,readscoresbyyear2_df, on="school_name",how="outer")

# Rename df columns
readscoresbyyear_df = readscoresbyyear_df.rename(columns={"reading_score_x_x":"Year 9", "reading_score_y_x": "Year 10", "reading_score_x_y": "Year 11", "reading_score_y_y": "Year 12"})

readscoresbyyear_df.head(15)

Unnamed: 0_level_0,Year 9,Year 10,Year 11,Year 12
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,70.90192,70.848265,70.317346,72.195525
Cabrera High School,71.172348,71.328326,71.201245,71.856021
Figueroa High School,70.261682,67.677588,69.152327,69.082126
Ford High School,69.615846,68.988701,70.735964,68.849722
Griffin High School,72.026895,70.746305,72.385042,69.434932
Hernandez High School,68.477569,70.621842,68.418199,69.244136
Holden High School,71.598425,71.096491,73.31068,70.481928
Huang High School,68.670616,69.516297,68.740638,68.671795
Johnson High School,68.719286,69.295029,69.969115,67.992521
Pena High School,70.949091,72.324,71.703125,71.513812


## Scores by School Spending

In [30]:
# Calcualte max per student budget value
school_summary_df["Per Student Budget"].max()


'$655'

In [31]:
# Calcualte min per student budget value
school_summary_df["Per Student Budget"].min()

'$578'

In [32]:
# Create array of total school budget to replace objects with floats
totalschoolbudget_string_array = school_summary_df["Total School Budget"].unique()
totalschoolbudget_string_array

array(['$3,124,928', '$1,081,356', '$1,884,411', '$1,763,916', '$917,500',
       '$3,022,020', '$248,087', '$1,910,635', '$3,094,650', '$585,858',
       '$2,547,363', '$1,056,600', '$1,043,130', '$1,319,574',
       '$1,049,400'], dtype=object)

In [33]:
# Create array of per student budget to replace objects with floats
perstudentbudget_string_array = school_summary_df["Per Student Budget"].unique()
perstudentbudget_string_array

array(['$628', '$582', '$639', '$644', '$625', '$652', '$581', '$655',
       '$650', '$609', '$637', '$600', '$638', '$578', '$583'],
      dtype=object)

In [34]:
# Replace total school budget column strings with integers
school_summary_df["Total School Budget"] = school_summary_df["Total School Budget"].replace(['$3,124,928', 
       '$1,081,356', '$1,884,411', '$1,763,916', '$917,500', '$3,022,020', '$248,087', '$1,910,635', '$3,094,650', 
       '$585,858', '$2,547,363', '$1,056,600', '$1,043,130', '$1,319,574', '$1,049,400'], 
       [3124928, 1081356, 1884411, 1763916, 917500, 3022020, 248087, 1910635, 3094650, 585858, 2547363, 1056600, 
       1043130, 1319574, 1049400])

# Replace per student budget column strings with integers
school_summary_df["Per Student Budget"] = school_summary_df["Per Student Budget"].replace(['$628', '$582', '$639', 
       '$644', '$625', '$652', '$581', '$655', '$650', '$609', '$637', '$600', '$638', '$578', '$583'], 
       [628, 582, 639, 644, 625, 652, 581, 655, 650, 609, 637, 600, 638, 578, 583])

# Convert total school budget column integers to floats
school_summary_df["Total School Budget"] = school_summary_df["Total School Budget"].astype(float)

# Convert per student budget column integers to floats
school_summary_df["Per Student Budget"] = school_summary_df["Per Student Budget"].astype(float)

# Check objects successfully converted
school_summary_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 0 to 14
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   School Name            15 non-null     object 
 1   School Type            15 non-null     object 
 2   Total Students         15 non-null     int64  
 3   Total School Budget    15 non-null     float64
 4   Per Student Budget     15 non-null     float64
 5   Average Maths Score    15 non-null     float64
 6   Average Reading Score  15 non-null     float64
 7   % Passing Maths        15 non-null     float64
 8   % Passing Reading      15 non-null     float64
 9   % Overall Passing      15 non-null     float64
dtypes: float64(7), int64(1), object(2)
memory usage: 1.3+ KB


In [35]:
# Create bins for school spending
bins_spending = [575, 600, 625, 650, 675]

# Name bins for school spending
bins_spending_names = ["$575-600","$600-625", "$625-650", "$650-675"]

school_summary_df["Spending Ranges (Per Student)"] = pd.cut(school_summary_df["Per Student Budget"], bins_spending, labels=bins_spending_names, include_lowest=True)
school_summary_df

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing,Spending Ranges (Per Student)
0,Bailey High School,Government,4976,3124928.0,628.0,72.352894,71.008842,91.639871,87.379421,80.084405,$625-650
1,Cabrera High School,Independent,1858,1081356.0,582.0,71.657158,71.359526,90.850377,89.074273,80.785791,$575-600
2,Figueroa High School,Government,2949,1884411.0,639.0,68.698542,69.077993,81.654798,82.807731,67.650051,$625-650
3,Ford High School,Government,2739,1763916.0,644.0,69.091274,69.572472,82.438846,82.219788,67.46988,$625-650
4,Griffin High School,Independent,1468,917500.0,625.0,71.788147,71.245232,91.212534,88.487738,81.33515,$600-625
5,Hernandez High School,Government,4635,3022020.0,652.0,68.874865,69.186408,80.949299,81.877023,66.364617,$650-675
6,Holden High School,Independent,427,248087.0,581.0,72.583138,71.660422,89.929742,88.52459,78.922717,$575-600
7,Huang High School,Government,2917,1910635.0,655.0,68.935207,68.910525,81.693521,81.453548,66.712376,$650-675
8,Johnson High School,Government,4761,3094650.0,650.0,68.8431,69.039277,82.062592,81.978576,67.191766,$625-650
9,Pena High School,Independent,962,585858.0,609.0,72.088358,71.613306,91.683992,86.590437,79.209979,$600-625


In [36]:
# Group df by spending ranges (bins)
school_summary_spend_df = school_summary_df.groupby("Spending Ranges (Per Student)")

# Create new df showing mean values
school_summary_spend_df = school_summary_spend_df.mean()

# Drop unneeded columns
school_summary_spend_df = school_summary_spend_df.drop('Total Students', axis=1)
school_summary_spend_df = school_summary_spend_df.drop('Total School Budget', axis=1)
school_summary_spend_df = school_summary_spend_df.drop('Per Student Budget', axis=1)

school_summary_spend_df

Unnamed: 0_level_0,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
$575-600,71.498484,70.624823,89.376521,86.454833,77.152294
$600-625,71.938252,71.429269,91.448263,87.539088,80.272565
$625-650,70.102537,69.900563,85.407836,84.068723,71.88268
$650-675,68.905036,69.048466,81.32141,81.665285,66.538496


## Scores by School Size (Total Students)

In [37]:
# Calcualte max total students
school_summary_df["Total Students"].max()

4976

In [38]:
# Calculate min total students
school_summary_df["Total Students"].min()

427

In [39]:
# Create bins for school size (Total Students)
bins_size = [0, 1250, 2500, 3750, 5000]

# Name bins for school spending
bins_size_names = ["0-1250","1250-2500", "2500-3750", "3750-5000"]

school_summary_df["School Size"] = pd.cut(school_summary_df["Total Students"], bins_size, labels=bins_size_names, include_lowest=True)
school_summary_df

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing,Spending Ranges (Per Student),School Size
0,Bailey High School,Government,4976,3124928.0,628.0,72.352894,71.008842,91.639871,87.379421,80.084405,$625-650,3750-5000
1,Cabrera High School,Independent,1858,1081356.0,582.0,71.657158,71.359526,90.850377,89.074273,80.785791,$575-600,1250-2500
2,Figueroa High School,Government,2949,1884411.0,639.0,68.698542,69.077993,81.654798,82.807731,67.650051,$625-650,2500-3750
3,Ford High School,Government,2739,1763916.0,644.0,69.091274,69.572472,82.438846,82.219788,67.46988,$625-650,2500-3750
4,Griffin High School,Independent,1468,917500.0,625.0,71.788147,71.245232,91.212534,88.487738,81.33515,$600-625,1250-2500
5,Hernandez High School,Government,4635,3022020.0,652.0,68.874865,69.186408,80.949299,81.877023,66.364617,$650-675,3750-5000
6,Holden High School,Independent,427,248087.0,581.0,72.583138,71.660422,89.929742,88.52459,78.922717,$575-600,0-1250
7,Huang High School,Government,2917,1910635.0,655.0,68.935207,68.910525,81.693521,81.453548,66.712376,$650-675,2500-3750
8,Johnson High School,Government,4761,3094650.0,650.0,68.8431,69.039277,82.062592,81.978576,67.191766,$625-650,3750-5000
9,Pena High School,Independent,962,585858.0,609.0,72.088358,71.613306,91.683992,86.590437,79.209979,$600-625,0-1250


In [40]:
# Group df by school size ranges (bins)
school_summary_size_df = school_summary_df.groupby("School Size")

# Create new df showing mean values
school_summary_size_df = school_summary_size_df.mean()

# Drop unneeded columns
school_summary_size_df = school_summary_size_df.drop('Total Students', axis=1)
school_summary_size_df = school_summary_size_df.drop('Total School Budget', axis=1)
school_summary_size_df = school_summary_size_df.drop('Per Student Budget', axis=1)

school_summary_size_df

Unnamed: 0_level_0,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0-1250,72.335748,71.636864,90.806867,87.557513,79.066348
1250-2500,71.046513,70.412956,88.669768,85.811214,76.275671
2500-3750,68.908341,69.186996,81.929055,82.160356,67.277435
3750-5000,70.529655,70.042628,86.362365,84.657967,73.265161


## Scores by School Type

In [49]:
# Group df by school type andn create new df showing mean values
school_summary_type_df = school_summary_df.groupby("School Type").mean()

# Drop unneeded columns
school_summary_type_df = school_summary_type_df.drop('Total Students', axis=1)
school_summary_type_df = school_summary_type_df.drop('Total School Budget', axis=1)
school_summary_type_df = school_summary_type_df.drop('Per Student Budget', axis=1)

school_summary_type_df

Unnamed: 0_level_0,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Government,69.834806,69.675929,84.462375,83.587562,70.698993
Independent,71.368822,70.718933,89.204043,86.247789,76.97334
