# PyCitySchools

### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [None]:
# Import dependencies
import pandas as pd

# Locations of the raw CSV inputs, relative to the working directory
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Load each CSV into its own dataframe
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Attach each school's attributes to every student row. The left join keeps
# every student record; "school_name" is the only shared key, so it is passed
# once instead of as a list that repeats the same column.
complete_df = pd.merge(student_data, school_data, how="left", on="school_name")

# District Summary

* Calculate the total number of schools

* Calculate the total number of students

* Calculate the total budget

* Calculate the average math score 

* Calculate the average reading score

* Calculate the percentage of students with a passing math score (70 or greater)

* Calculate the percentage of students with a passing reading score (70 or greater)

* Calculate the percentage of students who passed math **and** reading (% Overall Passing)

* Create a dataframe to hold the above results

* Optional: give the displayed data cleaner formatting

In [None]:
# Sanity check: number of non-null entries in each column of the merged data.
# Columns with fewer entries than the others would point to incomplete rows.
complete_df.notna().sum()

In [None]:
# District-wide totals: the budget is the sum over all schools, and the school
# and student counts tally the non-null IDs in their source tables
total_budget = school_data["budget"].sum()
total_students = student_data["Student ID"].count()
total_schools = school_data["School ID"].count()

In [None]:
# District-wide mean score per subject, taken over every row of the merged data
average_math, average_reading = (
    complete_df[score_column].mean()
    for score_column in ("math_score", "reading_score")
)

In [None]:
# A score of 70 or higher counts as passing. For each subject: keep the
# passing rows, count them by "Student ID", and express that count as a
# percentage of all students.
passing_math_scores = complete_df[complete_df["math_score"].ge(70)]
passed_math = passing_math_scores["Student ID"].count()
math_percentage = 100 * (passed_math / total_students)

passing_reading_scores = complete_df[complete_df["reading_score"].ge(70)]
passed_reading = passing_reading_scores["Student ID"].count()
reading_percentage = 100 * (passed_reading / total_students)

In [None]:
# Rows where BOTH the math and the reading score are 70 or higher,
# then the number of such rows counted by "Student ID"
passing_both = complete_df[complete_df[["math_score", "reading_score"]].ge(70).all(axis=1)]
passed_both = passing_both["Student ID"].count()

In [None]:
# Share of all students who passed both math and reading, as a percentage
passed_both_percentage = 100 * (passed_both / total_students)

In [None]:
# Gather the district-level metrics and turn them into a one-row summary table
district_metrics = {
    "Total Schools": total_schools,
    "Total Students": total_students,
    "Total Budget": total_budget,
    "Average Math Score": average_math,
    "Average Reading Score": average_reading,
    "% Passing Math": math_percentage,
    "% Passing Reading": reading_percentage,
    "% Overall Passing": passed_both_percentage,
}
formatted_df = pd.DataFrame({label: [value] for label, value in district_metrics.items()})

In [None]:
# Turn the District Summary values into display strings: thousands separators
# for the student count, a dollar sign for the budget, four decimals for the
# average scores, and two decimals plus a percent sign for the passing rates.
district_formats = {
    "Total Students": "{:,}",
    "Total Budget": "${:,}",
    "Average Math Score": "{:,.4f}",
    "Average Reading Score": "{:,.4f}",
    "% Passing Math": "{:,.2f}%",
    "% Passing Reading": "{:,.2f}%",
    "% Overall Passing": "{:,.2f}%",
}
for column_label, pattern in district_formats.items():
    formatted_df[column_label] = formatted_df[column_label].map(pattern.format)

In [None]:
# Show the formatted District Summary table as this cell's result
formatted_df

## School Summary

* Create an overview table that summarizes key metrics about each school, including:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * % Overall Passing (The percentage of students that passed math **and** reading.)
  
* Create a dataframe to hold the above results

In [None]:
# Build the per-school overview: keep the descriptive columns, key the rows by
# school name, and order them alphabetically by that name
overview_df = school_data.loc[:, ["school_name", "type", "size", "budget"]]
overview_df = overview_df.set_index("school_name")
overview_df = overview_df.sort_values(by="school_name")
# Clear the index label so it is not shown above the school names
overview_df.index.name = None

In [None]:
# Give the overview columns their display names. Only column labels are
# mapped: the school names are the index of overview_df rather than a column,
# so there is no "school_name" column for rename to act on.
renamed_overview_df = overview_df.rename(columns={"type": "School Type",
                                                  "size": "Total Students",
                                                  "budget": "Total School Budget"})
renamed_overview_df.index.name = None

In [None]:
# Per-school figures derived from the merged student-level dataframe.
# Counting rows per school name gives the number of student records per school.
students_per_school = complete_df["school_name"].value_counts()
# The merge repeats a school's budget on each of its student rows, so the
# per-school mean recovers that budget. "budget" is selected BEFORE
# aggregating: averaging the whole frame would also try to average the text
# columns, which raises a TypeError on pandas 2.0 and later.
budget_per_school = complete_df.groupby("school_name")["budget"].mean()
# Both Series are indexed by school name, so the division aligns per school
budget_per_student = budget_per_school / students_per_school
# School type looked up by school name, straight from the school table
school_type = school_data.set_index("school_name")["type"]

In [None]:
# Group the merged student rows by school so that per-school statistics can
# be computed from the grouped object
grouped_by_school_df = complete_df.groupby(by="school_name")

In [None]:
# Mean score per school for each subject, computed from the grouped data
per_school_reading_average = grouped_by_school_df["reading_score"].agg("mean")
per_school_math_average = grouped_by_school_df["math_score"].agg("mean")

In [None]:
# per_school_reading_average

In [None]:
# Declare variable, count number of students per school, and store values
#per_school_total_students = grouped_by_school_df["Student ID"].count()
# per_school_total_students

In [None]:
# Math passing rate per school (a score of 70 or higher counts as passing).
# Keep only the passing rows, group them by school, and count them.
per_school_passing_math = complete_df[complete_df["math_score"] >= 70].groupby(["school_name"])
per_school_passed_math = per_school_passing_math["Student ID"].count()
# A school with no passing students is absent from the filtered counts, so the
# index-aligned division yields NaN for it; report that case as 0 percent.
per_school_math_percentage = (per_school_passed_math / students_per_school * 100).fillna(0)

In [None]:
# Reading passing rate per school (a score of 70 or higher counts as passing).
# Keep only the passing rows, group them by school, and count them.
per_school_passing_reading = complete_df[complete_df["reading_score"] >= 70].groupby(["school_name"])
per_school_passed_reading = per_school_passing_reading["Student ID"].count()
# A school with no passing students is absent from the filtered counts, so the
# index-aligned division yields NaN for it; report that case as 0 percent.
per_school_reading_percentage = (per_school_passed_reading / students_per_school * 100).fillna(0)

In [None]:
# Overall passing rate per school: students who scored 70 or higher in BOTH
# math and reading. Keep only those rows, group them by school, and count them.
per_school_passing_both = complete_df[(complete_df["math_score"] >= 70) &
                                      (complete_df["reading_score"] >= 70)].groupby(["school_name"])
per_school_passed_both = per_school_passing_both["Student ID"].count()

# A school with no student passing both subjects is absent from the filtered
# counts, so the index-aligned division yields NaN for it; report that case
# as 0 percent.
per_school_passed_both_percentage = (per_school_passed_both / students_per_school * 100).fillna(0)

In [None]:
# Assemble the School Summary table from the per-school Series, which are
# matched to one another by school name
school_summary_columns = {
    "School Type": school_type,
    "Total Students": students_per_school,
    "Total School Budget": budget_per_school,
    "Per Student Budget": budget_per_student,
    "Average Math Score": per_school_math_average,
    "Average Reading Score": per_school_reading_average,
    "% Passing Math": per_school_math_percentage,
    "% Passing Reading": per_school_reading_percentage,
    "% Overall Passing": per_school_passed_both_percentage,
}
formatted_overview_df = pd.DataFrame(school_summary_columns)

# Only the total budget is replaced by a display string (dollar sign and
# thousands separators), built from the figures in renamed_overview_df.
# Every other column is left numeric.
budget_display = renamed_overview_df["Total School Budget"].map("${:,}".format)
formatted_overview_df["Total School Budget"] = budget_display

formatted_overview_df

In [None]:
# Extend the renamed overview table with the computed per-school metrics;
# each Series is matched to the table rows by school name
derived_columns = {
    "Per Student Budget": budget_per_student,
    "Average Math Score": per_school_math_average,
    "Average Reading Score": per_school_reading_average,
    "% Passing Math": per_school_math_percentage,
    "% Passing Reading": per_school_reading_percentage,
    "% Overall Passing": per_school_passed_both_percentage,
}
for column_label, per_school_values in derived_columns.items():
    renamed_overview_df[column_label] = per_school_values
renamed_overview_df.index.name = None
renamed_overview_df

In [None]:
# Create another dataframe to hold formatted values for cleaner look in final School Summary table
# Does this need to be in same cell as above???

# formatted_overview_df = pd.DataFrame({"Total Students":[per_school_total_students],
#                                "Total School Budget":[budget_per_school],
#                                "Per Student Budget":[budget_per_student],
#                                "Average Math Score":[per_school_math_average],
#                                "Average Reading Score":[per_school_reading_average],
#                                "% Passing Math":[per_school_math_percentage],
#                                "% Passing Reading":[per_school_reading_percentage],
#                                "% Overall Passing":[per_school_passed_both_percentage]})
# formatted_overview_df

In [None]:
# Apply formatted values for cleaner look in final School Summary table
# formatted_overview_df["Total Students"] = renamed_overview_df["Total Students"].map("{:,}".format)
# formatted_overview_df["Total School Budget"] = renamed_overview_df["Total School Budget"].map("${:,}".format)
# formatted_overview_df["Per Student Budget"] = renamed_overview_df["Per Student Budget"].map("${:2.0f}".format)
# formatted_overview_df["Average Math Score"] = renamed_overview_df["Average Math Score"].map("{:,.4f}".format)
# formatted_overview_df["Average Reading Score"] = renamed_overview_df["Average Reading Score"].map("{:,.4f}".format)
# formatted_overview_df["% Passing Math"] = renamed_overview_df["% Passing Math"].map("{:,.2f}%".format)
# formatted_overview_df["% Passing Reading"] = renamed_overview_df["% Passing Reading"].map("{:,.2f}%".format)
# formatted_overview_df["% Overall Passing"] = renamed_overview_df["% Overall Passing"].map("{:,.2f}%".format)

In [None]:
# Format the derived columns of renamed_overview_df for display.
# This overwrites the numeric values with strings, so afterwards these columns
# are text and cannot be used for numeric comparisons or binning.
# "Total Students" and "Total School Budget" are left as numbers.
overview_formats = {
    "Per Student Budget": "${:2.0f}",
    "Average Math Score": "{:,.4f}",
    "Average Reading Score": "{:,.4f}",
    "% Passing Math": "{:,.2f}%",
    "% Passing Reading": "{:,.2f}%",
    "% Overall Passing": "{:,.2f}%",
}
for column_label, pattern in overview_formats.items():
    renamed_overview_df[column_label] = renamed_overview_df[column_label].map(pattern.format)

In [None]:
# Show the dtype of each School Summary column (this displays the column
# types of formatted_overview_df, not the table itself)
formatted_overview_df.dtypes

## Top Performing Schools (By % Overall Passing)

* Sort and display the top five performing schools by % overall passing.

In [None]:
# Rank the schools from highest to lowest % Overall Passing (descending order
# has to be requested explicitly) and show the first five rows
top_five_df = formatted_overview_df.sort_values(by="% Overall Passing", ascending=False)
top_five_df.index.name = None
top_five_df.iloc[:5]

## Bottom Performing Schools (By % Overall Passing)

* Sort and display the five worst-performing schools by % overall passing.

In [None]:
# Rank the schools from lowest to highest % Overall Passing and show the
# first five rows, i.e. the five lowest-ranked schools
worst_five_df = formatted_overview_df.sort_values(by="% Overall Passing", ascending=True)
worst_five_df.index.name = None
worst_five_df.iloc[:5]

## Math Scores by Grade

* Create a table that lists the average math score for students of each grade level (9th, 10th, 11th, 12th) at each school.

  * Create a pandas series for each grade. Hint: use a conditional statement.
  
  * Group each series by school
  
  * Combine the series into a dataframe
  
  * Optional: give the displayed data cleaner formatting

In [None]:
# One Series per grade level: keep only that grade's rows, group them by
# school, and average the math scores
math_by_grade = {
    grade_label: complete_df[complete_df["grade"] == grade_label]
    .groupby("school_name")["math_score"]
    .mean()
    for grade_label in ("9th", "10th", "11th", "12th")
}
freshman_math = math_by_grade["9th"]
sophomore_math = math_by_grade["10th"]
junior_math = math_by_grade["11th"]
senior_math = math_by_grade["12th"]

In [None]:
# Combine the four per-grade Series into one table: one row per school,
# one column per grade
math_scores_df = pd.DataFrame(
    dict(
        zip(
            ("9th", "10th", "11th", "12th"),
            (freshman_math, sophomore_math, junior_math, senior_math),
        )
    )
)
# Clear the index label so it is not shown above the school names
math_scores_df.index.name = None

In [None]:
# Render every grade column of the math table as a string with four decimals
for grade_label in ("9th", "10th", "11th", "12th"):
    math_scores_df[grade_label] = math_scores_df[grade_label].map("{:,.4f}".format)

In [None]:
# Show the formatted Math Scores by Grade table as this cell's result
math_scores_df

## Reading Score by Grade 

* Perform the same operations as above for reading scores

In [None]:
# One Series per grade level: keep only that grade's rows, group them by
# school, and average the reading scores
reading_by_grade = {
    grade_label: complete_df[complete_df["grade"] == grade_label]
    .groupby("school_name")["reading_score"]
    .mean()
    for grade_label in ("9th", "10th", "11th", "12th")
}
freshman_reading = reading_by_grade["9th"]
sophomore_reading = reading_by_grade["10th"]
junior_reading = reading_by_grade["11th"]
senior_reading = reading_by_grade["12th"]

In [None]:
# Combine the four per-grade Series into one table: one row per school,
# one column per grade
reading_scores_df = pd.DataFrame(
    dict(
        zip(
            ("9th", "10th", "11th", "12th"),
            (freshman_reading, sophomore_reading, junior_reading, senior_reading),
        )
    )
)
# Clear the index label so it is not shown above the school names
reading_scores_df.index.name = None

In [None]:
# Render every grade column of the reading table as a string with four decimals
for grade_label in ("9th", "10th", "11th", "12th"):
    reading_scores_df[grade_label] = reading_scores_df[grade_label].map("{:,.4f}".format)

In [None]:
# Show the formatted Reading Scores by Grade table as this cell's result
reading_scores_df

## Scores by School Spending

* Create a table that breaks down school performances based on average Spending Ranges (Per Student). Use 4 reasonable bins to group school spending. Include in the table each of the following:
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * % Overall Passing (the percentage of students who passed math **and** reading)

In [None]:
# Start the spending analysis from an independent copy of the School Summary
# table. A plain assignment would only give formatted_overview_df a second
# name, so any column added to scores_by_spending_df would silently be added
# to the School Summary table as well.
scores_by_spending_df = formatted_overview_df.copy()

In [None]:
# Per-student spending ranges: each entry pairs a range's upper edge with its
# display label; the lowest range starts at 0
_spending_ranges = [
    (585, "<$585"),
    (630, "$585-$630"),
    (645, "$630-$645"),
    (680, "$645-$680"),
]
bins = [0] + [upper_edge for upper_edge, _ in _spending_ranges]
bin_names = [label for _, label in _spending_ranges]

In [None]:
# Label each school with the spending range that its per-student budget falls
# into. right=False makes every range include its lower edge and exclude its
# upper edge.
scores_by_spending_df["Budget Spending per Student"] = pd.cut(
    budget_per_student,
    bins=bins,
    labels=bin_names,
    right=False,
)
scores_by_spending_df

## Scores by School Size

* Perform the same operations as above, based on school size.

In [None]:
# School size ranges (by number of students): each entry pairs a range's
# upper edge with its display label; the lowest range starts at 0
_size_ranges = [
    (1000, "Small (<1,000)"),
    (2000, "Medium (1,000-2,000)"),
    (5000, "Large (2,000-5,000)"),
]
size_bins = [0] + [upper_edge for upper_edge, _ in _size_ranges]
size_bin_names = [label for _, label in _size_ranges]

In [None]:
# Label each school with the size range that its student count falls into.
# right=False makes every range include its lower edge and exclude its upper
# edge.
formatted_overview_df["School Size"] = pd.cut(
    students_per_school,
    bins=size_bins,
    labels=size_bin_names,
    right=False,
)
formatted_overview_df

## Scores by School Type

* Perform the same operations as above, based on school type

In [None]:
# Average each performance metric over the schools of each School Type.
# Only the numeric score columns are selected before aggregating: the summary
# table also holds text and categorical columns (formatted budget strings,
# bin labels), and pandas 2.0 and later raise a TypeError when asked to
# average those. Aggregating once also avoids recomputing the same group means
# for every metric.
type_metric_columns = [
    "Average Math Score",
    "Average Reading Score",
    "% Passing Math",
    "% Passing Reading",
    "% Overall Passing",
]
type_averages = formatted_overview_df.groupby("School Type")[type_metric_columns].mean()
math_type = type_averages["Average Math Score"]
reading_type = type_averages["Average Reading Score"]
type_math_percent = type_averages["% Passing Math"]
type_reading_percent = type_averages["% Passing Reading"]
type_overall_percent = type_averages["% Overall Passing"]

In [None]:
# Combine the per-type averages into the Scores by School Type table and
# show it
type_summary_columns = dict(
    zip(
        (
            "Average Math Score",
            "Average Reading Score",
            "% Passing Math",
            "% Passing Reading",
            "% Overall Passing",
        ),
        (
            math_type,
            reading_type,
            type_math_percent,
            type_reading_percent,
            type_overall_percent,
        ),
    )
)
school_type_summary_df = pd.DataFrame(type_summary_columns)

school_type_summary_df