In [227]:
# Import dependencies.
import pandas as pd
import os

In [228]:
# Relative paths of the two input CSV files.
school_data_to_load = "resources/schools_complete.csv"
student_data_to_load = "resources/students_complete.csv"

In [229]:
# Load each CSV file into its own DataFrame (schools first, then students).
school_df, student_df = (
    pd.read_csv(csv_path)
    for csv_path in (school_data_to_load, student_data_to_load)
)

In [230]:
# Titles and credentials to strip from student names. Each entry carries its
# separating space so removing it leaves no stray whitespace behind.
prefixes_suffixes = ["Dr. ", "Mr. ", "Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

# Remove every listed token by replacing it with the empty string "".
# regex=False forces a literal match: on pandas versions where str.replace
# interprets the pattern as a regular expression by default, the "." in
# "Dr. " would otherwise match any character.
# NOTE(review): tokens are removed wherever they occur in the name, not only
# at the start/end — confirm that is acceptable for this dataset.
for word in prefixes_suffixes:
    student_df["student_name"] = student_df["student_name"].str.replace(word, "", regex=False)

In [231]:
# Combine student and school rows into one dataset, joined on the school name.
# The key is given once (the original listed it twice), and
# validate="many_to_one" makes pandas raise MergeError if school_df contains a
# school name more than once — otherwise such duplicates would silently
# multiply the matching student rows and inflate every count below.
school_data_complete_df = pd.merge(
    student_df,
    school_df,
    on="school_name",
    validate="many_to_one",
)

In [232]:
# Total number of students: non-null "Student ID" entries in the merged data.
student_count = school_data_complete_df.loc[:, "Student ID"].count()


In [233]:
# Total number of schools: non-null "school_name" entries in the school table.
school_count = school_df.loc[:, "school_name"].count()


In [234]:
# District-wide budget: sum of the "budget" column over all schools.
total_budget = school_df.loc[:, "budget"].sum()

In [235]:
# Mean reading score across every student in the merged data.
average_reading = school_data_complete_df.loc[:, "reading_score"].mean()

In [236]:
# Mean math score across every student in the merged data.
average_math = school_data_complete_df.loc[:, "math_score"].mean()

In [237]:
# Rows for students whose math score is at least 70 (the passing mark used here).
passing_math = school_data_complete_df.loc[
    school_data_complete_df["math_score"].ge(70)
]

In [238]:
# Rows for students whose reading score is at least 70 (the passing mark used here).
passing_reading = school_data_complete_df.loc[
    school_data_complete_df["reading_score"].ge(70)
]

In [239]:
# Number of students passing each subject.
# Count the "Student ID" column — the same column student_count is derived
# from — so numerator and denominator of the pass percentages agree. count()
# skips missing values, so counting "student_name" would drop any passing
# student whose name is blank.
passing_math_count = passing_math["Student ID"].count()
passing_reading_count = passing_reading["Student ID"].count()

In [240]:
# Share of all students passing each subject, expressed as a percentage.
# "/" is already true division in Python 3, so no float() cast is needed.
passing_math_percentage = passing_math_count / student_count * 100
passing_reading_percentage = passing_reading_count / student_count * 100

In [241]:
# Rows for students who scored at least 70 in BOTH math and reading.
passing_math_reading = school_data_complete_df.loc[
    school_data_complete_df["math_score"].ge(70)
    & school_data_complete_df["reading_score"].ge(70)
]


In [242]:
# Number of students passing both subjects.
# Counted on "Student ID" (the column student_count uses) rather than
# "student_name": count() ignores missing values, so a blank name would
# otherwise drop a passing student from the total.
passing_both = passing_math_reading["Student ID"].count()

In [243]:
# Percentage of all students who passed both math and reading.
overall_passing_percent = passing_both / student_count * 100

In [244]:
# Gather the district-level metrics under their display labels, then wrap the
# single record in a list so the resulting DataFrame has exactly one row.
district_metrics = {
    "Total Schools": school_count,
    "Total Students": student_count,
    "Total Budget": total_budget,
    "Average Math Score": average_math,
    "Average Reading Score": average_reading,
    "% Passing Math": passing_math_percentage,
    "% Passing Reading": passing_reading_percentage,
    "% Overall Passing": overall_passing_percent,
}
district_summary_df = pd.DataFrame([district_metrics])


In [245]:
# Display format for each district summary column: thousands separators for
# counts, dollars with cents for the budget, one decimal for average scores,
# and whole numbers for the pass percentages.
district_formats = {
    "Total Students": "{:,}",
    "Total Budget": "${:,.2f}",
    "Average Math Score": "{:.1f}",
    "Average Reading Score": "{:.1f}",
    "% Passing Math": "{:.0f}",
    "% Passing Reading": "{:.0f}",
    "% Overall Passing": "{:.0f}",
}

# Formatting replaces the numbers with strings. Only format columns that are
# still numeric so that re-running this cell is harmless instead of raising
# ValueError when a numeric format spec is applied to an already-formatted string.
for column, template in district_formats.items():
    if pd.api.types.is_numeric_dtype(district_summary_df[column]):
        district_summary_df[column] = district_summary_df[column].map(template.format)

# Show the formatted summary as the cell output.
district_summary_df

In [246]:
# School type for each school, as a Series indexed by school name.
per_school_types = school_df.set_index("school_name").loc[:, "type"]

In [247]:
# Wrap the school-type Series in a one-column DataFrame (used for testing).
df = per_school_types.to_frame()

In [248]:
# Student count for each school, read from the "size" column and indexed by
# school name.
per_school_counts = school_df.set_index("school_name").loc[:, "size"]

In [249]:
# Budget for each school, read from the "budget" column and indexed by
# school name.
per_school_budget = school_df.set_index("school_name").loc[:, "budget"]

In [250]:
# Spending per student: each school's budget divided by its student count
# (the two Series are aligned on the school-name index).
per_school_capita = per_school_budget.div(per_school_counts)

In [251]:
# Student rows with a passing score (at least 70), one filtered DataFrame per
# subject. Later cells rebind these names to per-school counts.
per_school_passing_math = school_data_complete_df.loc[
    school_data_complete_df["math_score"].ge(70)
]
per_school_passing_reading = school_data_complete_df.loc[
    school_data_complete_df["reading_score"].ge(70)
]


In [252]:
# Average math and reading score for each school.
# Select the score column BEFORE aggregating: calling .mean() on the whole
# grouped frame also tries to average text columns such as "student_name",
# which raises TypeError on pandas 2.x, and averages every column just to
# keep one. The grouping is built once and shared by both subjects.
scores_by_school = school_data_complete_df.groupby("school_name")
per_school_math = scores_by_school["math_score"].mean()
per_school_reading = scores_by_school["reading_score"].mean()

In [253]:
# Number of passing students per school for each subject: group the filtered
# rows by school and count the non-null student names.
# NOTE(review): both names are rebound from filtered rows to per-school
# counts here, so this cell cannot be re-run on its own without first
# re-running the cell that builds the filtered frames.
per_school_passing_math = (
    per_school_passing_math.groupby("school_name")["student_name"].count()
)
per_school_passing_reading = (
    per_school_passing_reading.groupby("school_name")["student_name"].count()
)


In [254]:
# Convert the per-school passing counts into percentages of each school's
# student count.
# NOTE(review): the result overwrites the counts it was computed from, so
# running this cell twice divides by the student count a second time.
per_school_passing_math = per_school_passing_math.div(per_school_counts).mul(100)
per_school_passing_reading = per_school_passing_reading.div(per_school_counts).mul(100)

In [255]:
# Student rows with a score of at least 70 in both math and reading.
per_passing_math_reading = school_data_complete_df.loc[
    school_data_complete_df["math_score"].ge(70)
    & school_data_complete_df["reading_score"].ge(70)
]

In [256]:
# Number of students per school who passed both subjects (non-null names
# counted within each school group).
# NOTE(review): this rebinds the name from filtered rows to per-school counts,
# so the cell depends on the preceding filter cell having just been run.
per_passing_math_reading = (
    per_passing_math_reading.groupby("school_name")["student_name"].count()
)

In [257]:
# Percentage of each school's students who passed both math and reading.
per_overall_passing_percentage = per_passing_math_reading.div(per_school_counts).mul(100)

In [258]:
# Assemble the per-school metrics into one table. Every value is a Series
# indexed by school name, so the DataFrame constructor aligns them row by row
# on that index.
per_school_columns = {
    "School Type": per_school_types,
    "Total Students": per_school_counts,
    "Total School Budget": per_school_budget,
    "Per Student Budget": per_school_capita,
    "Average Math Score": per_school_math,
    "Average Reading Score": per_school_reading,
    "% Passing Math": per_school_passing_math,
    "% Passing Reading": per_school_passing_reading,
    "% Overall Passing": per_overall_passing_percentage,
}
per_school_summary_df = pd.DataFrame(per_school_columns)

In [259]:
# Render the two budget columns as dollar amounts with thousands separators
# and two decimals.
# Formatting turns the numbers into strings, so only columns that are still
# numeric are formatted; this keeps the cell safe to re-run instead of raising
# ValueError when the float format is applied to an already-formatted string.
for budget_column in ("Total School Budget", "Per Student Budget"):
    if pd.api.types.is_numeric_dtype(per_school_summary_df[budget_column]):
        per_school_summary_df[budget_column] = per_school_summary_df[budget_column].map("${:,.2f}".format)

# Display the first rows of the summary.
per_school_summary_df.head()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,66.680064,81.93328,54.642283
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477,97.039828,91.334769
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471,80.739234,53.204476
Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.309602,79.299014,54.289887
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371,97.138965,90.599455
