### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [1]:
# Dependencies and Setup
import pandas as pd

# --- read CSV files ---
SchoolFile = "Resources/schools_complete.csv"
StudentFile = "Resources/students_complete.csv"

# --- store file data into dataframes using Pandas library ---
SchoolData = pd.read_csv(SchoolFile)
StudentData = pd.read_csv(StudentFile)

# --- create a dictionary for format styling ---
FormatGuide = {"Total Students": "{:,}", "Total Budget": "${:,.2f}", "Total School Budget": "${:,.2f}", 
               "Per Student Budget": "${:,.2f}", "Average Math Score": "{:.2f}", "Average Reading Score": "{:.2f}", 
               "% Passing Math": "{:.2%}", "% Passing Reading": "{:.2%}", "% Overall Passing": "{:.2%}",
               "9th": "{0:.2f}%", "10th": "{0:.2f}%", "11th": "{0:.2f}%", "12th": "{0:.2f}%"}

# --- merge the data into one dataset on "school_name" using Pandas library ---
AllData = pd.merge(SchoolData, StudentData, how="left", on=["school_name"])
AllData.head()

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84


## District Summary

* Calculate the total number of schools

* Calculate the total number of students

* Calculate the total budget

* Calculate the average math score 

* Calculate the average reading score

* Calculate the percentage of students with a passing math score (70 or greater)

* Calculate the percentage of students with a passing reading score (70 or greater)

* Calculate the percentage of students who passed math **and** reading (% Overall Passing)

* Create a dataframe to hold the above results

* Optional: give the displayed data cleaner formatting

In [2]:
# --- DISTRICT SUMMARY ---

# --- find total num of schools, students, sum of budget ---
TotalSchools = SchoolData["School ID"].count()
TotalStudents = StudentData["Student ID"].count()
TotalBudget = SchoolData["budget"].sum()

# --- calculate average of math and reading scores ---
AvgMathScore = AllData["math_score"].mean()
AvgReadingScore = AllData["reading_score"].mean()

# --- find students with a scores of 70 or greater in math and reading, and store the count in new variables ---
PassMath = AllData.loc[AllData["math_score"] >= 70 , :]
NumPassMath = PassMath["Student ID"].count()

PassReading = AllData.loc[AllData["reading_score"] >= 70, :]
NumPassReading = PassReading["Student ID"].count()

OverallPass = AllData.loc[(AllData["math_score"] >= 70) & (AllData["reading_score"] >= 70), :]
NumOverallPass = OverallPass["Student ID"].count()

# --- calculate percentage of passing students ---
PercentPassMath = (NumPassMath / TotalStudents)
PercentPassReading = (NumPassReading / TotalStudents)
PercentOverallPass = (NumOverallPass / TotalStudents)

# --- create a dataframe to hold the results and pass Format Guide to style the table ---
DistrictSummaryDF = pd.DataFrame({"Total Schools": [TotalSchools],
                   "Total Students": [TotalStudents],
                   "Total Budget": [TotalBudget],
                   "Average Math Score": [AvgMathScore],
                   "Average Reading Score": [AvgReadingScore],
                   "% Passing Math": [PercentPassMath],
                   "% Passing Reading": [PercentPassReading],
                   "% Overall Passing": [PercentOverallPass]
                    }).style.format(FormatGuide)

# --- print dataframe ---
DistrictSummaryDF


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.99,81.88,74.98%,85.81%,65.17%


## School Summary

* Create an overview table that summarizes key metrics about each school, including:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * % Overall Passing (The percentage of students that passed math **and** reading.)
  
* Create a dataframe to hold the above results

In [3]:
# --- SCHOOL SUMMARY ---

# --- create a new dataframe from original SchoolData dataframe, set index to school name and rename columns ---
SchoolSummary = SchoolData[["school_name", "type", "size", "budget"]].set_index("school_name").sort_values("school_name")
SchoolSummary = SchoolSummary.rename(columns= {"type": "School Type",
                                               "size": "Total Students",
                                               "budget": "Total School Budget"})

# --- get total students, total school budget and store in variables for calculations ---
TotalStudentsbySchool = SchoolSummary["Total Students"]
TotalSchoolBudget = SchoolSummary["Total School Budget"]

# --- calculate per student budget ---
PerStudentBudget = TotalSchoolBudget / TotalStudentsbySchool

# --- create a groupby object from the merged dataframe to calculate student scores by school ---
GroupedSchools = AllData.groupby("school_name")

# --- calculate math and reading score averages by school ---
SchoolMathAvg = GroupedSchools["math_score"].mean()
SchoolReadingAvg = GroupedSchools["reading_score"].mean()

# --- calculate math, reading and overall passing percentages by school --- 
SchoolMathPercent = AllData[AllData["math_score"] >= 70].groupby(['school_name']).size() / TotalStudentsbySchool
SchoolReadingPercent = AllData[AllData["reading_score"] >= 70].groupby(['school_name']).size() / TotalStudentsbySchool
SchoolOverallPercent = AllData[(AllData["math_score"] >= 70) & (AllData["reading_score"] >= 70)].groupby(['school_name']).size() / TotalStudentsbySchool

# --- add calculated series above to the School Summary dataframe ---
SchoolSummary["Per Student Budget"] = PerStudentBudget
SchoolSummary["Average Math Score"] = SchoolMathAvg
SchoolSummary["Average Reading Score"] = SchoolReadingAvg
SchoolSummary["% Passing Math"] = SchoolMathPercent
SchoolSummary["% Passing Reading"] = SchoolReadingPercent
SchoolSummary["% Overall Passing"] = SchoolOverallPercent

# --- create final school summary dataframe with formatting ---
SchoolSummaryDF = SchoolSummary.style.format(FormatGuide)
SchoolSummaryDF.index.name = None
SchoolSummaryDF

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.05,81.03,66.68%,81.93%,54.64%
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.06,83.98,94.13%,97.04%,91.33%
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.71,81.16,65.99%,80.74%,53.20%
Ford High School,District,2739,"$1,763,916.00",$644.00,77.1,80.75,68.31%,79.30%,54.29%
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.35,83.82,93.39%,97.14%,90.60%
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.29,80.93,66.75%,80.86%,53.53%
Holden High School,Charter,427,"$248,087.00",$581.00,83.8,83.81,92.51%,96.25%,89.23%
Huang High School,District,2917,"$1,910,635.00",$655.00,76.63,81.18,65.68%,81.32%,53.51%
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.07,80.97,66.06%,81.22%,53.54%
Pena High School,Charter,962,"$585,858.00",$609.00,83.84,84.04,94.59%,95.95%,90.54%


## Top Performing Schools (By % Overall Passing)

* Sort and display the top five performing schools by % overall passing.

In [4]:
# --- Top Performing Schools (By % Overall Passing) ---

# --- use the sort_values method on the % Overall Passing column by descending order and save the first five (head) in a variable ---
TopFive = SchoolSummary.sort_values(["% Overall Passing"], ascending = False).head()

# --- display the top 5 performing schools with formatting ---
TopFiveDF = TopFive.style.format(FormatGuide)
TopFiveDF

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.06,83.98,94.13%,97.04%,91.33%
Thomas High School,Charter,1635,"$1,043,130.00",$638.00,83.42,83.85,93.27%,97.31%,90.95%
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.35,83.82,93.39%,97.14%,90.60%
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.27,83.99,93.87%,96.54%,90.58%
Pena High School,Charter,962,"$585,858.00",$609.00,83.84,84.04,94.59%,95.95%,90.54%


## Bottom Performing Schools (By % Overall Passing)

* Sort and display the five worst-performing schools by % overall passing.

In [5]:
# --- Bottom Performing Schools (By % Overall Passing) ---

# --- use the sort_values method on the % Overall Passing column by ascending order and save the first five (head) in a variable ---
BottomFive = SchoolSummary.sort_values(["% Overall Passing"]).head()

# --- display the bottom 5 performing schools with formatting ---
BottomFiveDF = BottomFive.style.format(FormatGuide)
BottomFiveDF

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Rodriguez High School,District,3999,"$2,547,363.00",$637.00,76.84,80.74,66.37%,80.22%,52.99%
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.71,81.16,65.99%,80.74%,53.20%
Huang High School,District,2917,"$1,910,635.00",$655.00,76.63,81.18,65.68%,81.32%,53.51%
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.29,80.93,66.75%,80.86%,53.53%
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.07,80.97,66.06%,81.22%,53.54%


## Math Scores by Grade

* Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.

  * Create a pandas series for each grade. Hint: use a conditional statement.
  
  * Group each series by school
  
  * Combine the series into a dataframe
  
  * Optional: give the displayed data cleaner formatting

In [6]:
# --- Math Scores by Grade ---

# --- create a series with math averages for each grade ---
Grade9MathAvg = AllData[AllData["grade"] == "9th"].groupby("school_name")["math_score"].mean()
Grade10MathAvg = AllData[AllData["grade"] == "10th"].groupby("school_name")["math_score"].mean()
Grade11MathAvg = AllData[AllData["grade"] == "11th"].groupby("school_name")["math_score"].mean()
Grade12MathAvg = AllData[AllData["grade"] == "12th"].groupby("school_name")["math_score"].mean()

# --- create a dataframe with averages ---
GradeMathAverages = pd.DataFrame({"9th": Grade9MathAvg,
                                  "10th": Grade10MathAvg,
                                  "11th": Grade11MathAvg,
                                  "12th": Grade12MathAvg})

# --- display math averages per grade by school with formatting ---
GradeMathAveragesDF = GradeMathAverages.style.format(FormatGuide)
GradeMathAveragesDF.index.name = None
GradeMathAveragesDF


Unnamed: 0,9th,10th,11th,12th
Bailey High School,77.08%,77.00%,77.52%,76.49%
Cabrera High School,83.09%,83.15%,82.77%,83.28%
Figueroa High School,76.40%,76.54%,76.88%,77.15%
Ford High School,77.36%,77.67%,76.92%,76.18%
Griffin High School,82.04%,84.23%,83.84%,83.36%
Hernandez High School,77.44%,77.34%,77.14%,77.19%
Holden High School,83.79%,83.43%,85.00%,82.86%
Huang High School,77.03%,75.91%,76.45%,77.23%
Johnson High School,77.19%,76.69%,77.49%,76.86%
Pena High School,83.63%,83.37%,84.33%,84.12%


## Reading Score by Grade 

* Perform the same operations as above for reading scores

In [7]:
# --- Reading Score by Grade ---

# --- create a series with reading averages for each grade ---
Grade9ReadingAvg = AllData[AllData["grade"] == "9th"].groupby(["school_name"])["reading_score"].mean()
Grade10ReadingAvg = AllData[AllData["grade"] == "10th"].groupby(["school_name"])["reading_score"].mean()
Grade11ReadingAvg = AllData[AllData["grade"] == "11th"].groupby(["school_name"])["reading_score"].mean()
Grade12ReadingAvg = AllData[AllData["grade"] == "12th"].groupby(["school_name"])["reading_score"].mean()

# --- create a dataframe with averages ---
GradeReadingAverages = pd.DataFrame({"9th": Grade9ReadingAvg,
                                  "10th": Grade10ReadingAvg,
                                  "11th": Grade11ReadingAvg,
                                  "12th": Grade12ReadingAvg})

# --- display math averages per grade by school with formatting ---
GradeReadingAveragesDF = GradeReadingAverages.style.format(FormatGuide)
GradeReadingAveragesDF.index.name = None
GradeReadingAveragesDF

Unnamed: 0,9th,10th,11th,12th
Bailey High School,81.30%,80.91%,80.95%,80.91%
Cabrera High School,83.68%,84.25%,83.79%,84.29%
Figueroa High School,81.20%,81.41%,80.64%,81.38%
Ford High School,80.63%,81.26%,80.40%,80.66%
Griffin High School,83.37%,83.71%,84.29%,84.01%
Hernandez High School,80.87%,80.66%,81.40%,80.86%
Holden High School,83.68%,83.32%,83.82%,84.70%
Huang High School,81.29%,81.51%,81.42%,80.31%
Johnson High School,81.26%,80.77%,80.62%,81.23%
Pena High School,83.81%,83.61%,84.34%,84.59%


## Scores by School Spending

* Create a table that breaks down school performances based on average Spending Ranges (Per Student). Use 4 reasonable bins to group school spending. Include in the table each of the following:
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)

In [8]:
# --- Scores by School Spending ---

# --- create a new dataframe from the School Summary dataframe ---
SchoolSpending = SchoolSummary[["Average Math Score", "Average Reading Score", "% Passing Math",
                                "% Passing Reading", "% Overall Passing"]]

# --- create bins to group school spending ---
SpendBins = [0, 585, 630, 645, 675]
SpendLabels = ["<$584", "$585-629", "$630-644", "$645-675"]

# --- create a new column in the dataframe and append the bins using cut() on Per Student Budget --- 
SchoolSpending["Spending Ranges (Per Student)"] = pd.cut(SchoolSummary["Per Student Budget"], SpendBins, labels = SpendLabels)

# --- create a dataframe groupby object on the spending range column ---
SchoolSpending = SchoolSpending.groupby("Spending Ranges (Per Student)").mean()

# --- display Scores by School Spending with formatting ---
SchoolSpendingDF = SchoolSpending.style.format(FormatGuide)
SchoolSpendingDF

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$584,83.46,83.93,93.46%,96.61%,90.37%
$585-629,81.9,83.16,87.13%,92.72%,81.42%
$630-644,78.52,81.62,73.48%,84.39%,62.86%
$645-675,77.0,81.03,66.16%,81.13%,53.53%


## Scores by School Size

* Perform the same operations as above, based on school size.

In [9]:
# --- Scores by School Size ---

# --- create a new dataframe from the School Summary dataframe ---
SchoolSizeScores = SchoolSummary[["Average Math Score", "Average Reading Score", "% Passing Math",
                                "% Passing Reading", "% Overall Passing"]]

# --- create bins to group school sizes ---
SizeBins = [0, 1000, 2000, 5000]
SizeLabels = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

# --- create a new column in the dataframe and append the bins using cut() on Total Students --- 
SchoolSizeScores["School Size"] = pd.cut(SchoolSummary["Total Students"], SizeBins, labels = SizeLabels)


# --- create a dataframe groupby object on the total students column ---
SchoolSizeScores = SchoolSizeScores.groupby("School Size").mean()

# --- display Scores by School Size with formatting ---
SchoolSizeDF = SchoolSizeScores.style.format(FormatGuide)
SchoolSizeDF

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.82,83.93,93.55%,96.10%,89.88%
Medium (1000-2000),83.37,83.86,93.60%,96.79%,90.62%
Large (2000-5000),77.75,81.34,69.96%,82.77%,58.29%


## Scores by School Type

* Perform the same operations as above, based on school type

In [10]:
# --- Scores by School Type ---

# --- create a new dataframe from the School Summary dataframe ---
SchoolTypeScores = SchoolSummary[["School Type", "Average Math Score", "Average Reading Score", "% Passing Math",
                                "% Passing Reading", "% Overall Passing"]]

# --- create a dataframe groupby object on the mean of school type column ---
SchoolTypeScores = SchoolTypeScores.groupby("School Type").mean()


# --- display Scores by School Type with formatting ---
SchoolTypeDF = SchoolTypeScores.style.format(FormatGuide)
SchoolTypeDF

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.47,83.9,93.62%,96.59%,90.43%
District,76.96,80.97,66.55%,80.80%,53.67%
