In [1]:
# Add the Pandas dependency
import pandas as pd

In [3]:
# Files to load.
# Both are relative paths, so they are resolved against the kernel's current
# working directory when pd.read_csv() opens them.
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

In [4]:
# Read the school data file and store it in a Pandas DataFrame

school_data_df = pd.read_csv(school_data_to_load)
school_data_df

#If you see the error FileNotFoundError in your output, this means that the CSV file was not found in the Resources subfolder inside the School_District_Analysis folder.

#To fix this error, 
    # add the CSV file to the Resources subfolder. 
    # Make sure the Resources subfolder is located in the School_District_Analysis folder
    # or you can use the indirect path approach with os.path.join() method.

# Pro Tip
    # To view the first five rows of a DataFrame, use the df.head() method after the DataFrame name. 
        # For the above DataFrame, the code looks like this:
            # school_data_df.head()
    # To view the last five rows of a DataFrame, use the df.tail() method after the DataFrame name.
    # To view any number of rows in a DataFrame, place a number inside the parentheses. 
        # For example, to get the top 10 rows, use df.head(10). 
        # To get the bottom 10 rows, use df.tail(10).


Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500
5,5,Wilson High School,Charter,2283,1319574
6,6,Cabrera High School,Charter,1858,1081356
7,7,Bailey High School,District,4976,3124928
8,8,Holden High School,Charter,427,248087
9,9,Pena High School,Charter,962,585858


In [5]:
# Read the student data file and store it in a Pandas DataFrame
student_data_df = pd.read_csv(student_data_to_load)
student_data_df.head()


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [None]:
# FIND MISSING VALUES
    # Missing values in any of the rows are called rows with null values
    # The ways to find the null values in large datasets are
        # count() method
        # isnull() method
        # notnull() method

In [6]:
#THE count() METHOD
    # With the count() method, we can get a count of the rows for each column containing data. 
    # By default, "null" values are not counted, so you can often quickly identify which columns have missing data.

# Determine if there are any missing values in the school data.
school_data_df.count()

# The output returns the name of the columns and the number of rows that are not null. 
# For the school_data_df DataFrame, there are no missing values, because there are 15 rows that contain data in schools_complete.csv. In the output, the number 15 is next to each column header, as shown in the following image: 
    

School ID      15
school_name    15
type           15
size           15
budget         15
dtype: int64

In [7]:
# THE isnull() METHOD
    # The Pandas library also has the isnull() method for determining empty rows. 
    # When you apply the isnull() method to a column, Series, or a DataFrame, 
        # a Boolean value will be returned
            #either "True" for the row or rows that are empty, 
                # i.e., null, 
            # or "False" for the rows that are not empty

# Determine if there are any missing values in the school data.
school_data_df.isnull()

Unnamed: 0,School ID,school_name,type,size,budget
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [8]:
# Determine if there are any missing values in the student data.
student_data_df.isnull()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
39165,False,False,False,False,False,False,False
39166,False,False,False,False,False,False,False
39167,False,False,False,False,False,False,False
39168,False,False,False,False,False,False,False


In [9]:
# Determine if there are any missing values in the student data.
# Total number of null (missing) values in each column
student_data_df.isnull().sum()

Student ID       0
student_name     0
gender           0
grade            0
school_name      0
reading_score    0
math_score       0
dtype: int64

In [10]:
# THE notnull() METHOD
    # When you apply the notnull() method to a column, Series, or a DataFrame, 
        #a Boolean will be returned: 
            #"True" for the row or rows that are not empty, 
            # or "False" for the row or rows that are empty. 
    # This method returns the opposite output of the isnull() method.
    
# Determine if there are not any missing values in the school data.
school_data_df.notnull()

Unnamed: 0,School ID,school_name,type,size,budget
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,True,True
3,True,True,True,True,True
4,True,True,True,True,True
5,True,True,True,True,True
6,True,True,True,True,True
7,True,True,True,True,True
8,True,True,True,True,True
9,True,True,True,True,True


In [11]:
# Determine if there are not any missing values in the student data.
student_data_df.notnull().sum()

# When we execute this code, 
    # we get the number of rows that are not null, 
    # which is 39,170 for each column.

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

In [None]:
# HANDLE MISSING DATA
    # Do nothing
    # Drop the row that has the missing value
    # Fill in the row that has the missing value 

In [12]:
# DETERMINING DATA TYPES
    # Six Common Data Types
        # Boolean
            # Pandas Name: bool
            # Ex: "True" and "False"
        # Integer
            # Pandas Name: int32
            # Ex: Values from –2,147,483,648 to 2,147,483,647
        # Integer
            # Pandas Name: int64
            # Ex: Values from –9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
        # Float
            # Pandas Name: float64
            # Ex: Floating Decimal
        # Object
            # Pandas Name: O, object
            # Typically strings; 
                # often used as a catchall for columns with different data types or other Python objects 
                 # like tuples, lists, and dictionaries
        # Datetime
            # Pandas Name: datetime64
            # Ex: Specific moment in time with nanosecond precision
                #i.e., 2019-06-03 16:04:00.465107
    # With the Pandas library, we can check the data types of each column by using the Pandas dtypes attribute on a DataFrame.

# Determine data types for the school DataFrame.
school_data_df.dtypes

# How would you find the data type of the budget column in the school data DataFrame?
    # school_data_df.budget.dtype
    # school_data_df["budget"].dtype


School ID       int64
school_name    object
type           object
size            int64
budget          int64
dtype: object

In [13]:
# Determine data types for the student DataFrame.
student_data_df.dtypes

#Based on the output
    # we determined that all of the columns we need to use for calculations are integers. 
        # Therefore, we won't need to change the data types for these columns.
        # However, there may be instances in which it's necessary to change the data type. 
            # Some CSV and text files
                # for example, may contain numbers as strings (or objects) rather than integers. 
                # These numbers would need to be converted to integers or floats.

Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
dtype: object

In [14]:
# Titles and credentials to strip from the student names.
# Prefixes end with a space and suffixes start with one, so the separating
# space is removed together with the title.
prefixes_suffixes = [
    "Dr. ", "Mr. ", "Ms. ", "Mrs. ", "Miss ",  # prefixes
    " MD", " DDS", " DVM", " PhD",             # suffixes
]

In [19]:
# Iterate through the entries in the "prefixes_suffixes" list and replace each
# one with an empty string, "".
# regex=False makes pandas treat every entry as literal text. With regex=True
# the "." in "Dr. ", "Mr. ", "Ms. " and "Mrs. " is a regular-expression wildcard
# that matches ANY character, so text that merely resembles a title could be
# removed from a name by accident.
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(word, "", regex=False)
student_data_df.head(10)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84
5,5,Bryan Miranda,M,9th,Huang High School,94,94
6,6,Sheena Carter,F,11th,Huang High School,82,80
7,7,Nicole Baker,F,12th,Huang High School,96,69
8,8,Michael Roth,M,10th,Huang High School,95,87
9,9,Matthew Greene,M,10th,Huang High School,96,84


In [20]:
# MERGE DATAFRAMES
# Combine the data into a single dataset.
# pd.merge() performs an inner join by default, so only students whose school_name
# also appears in school_data_df are kept.
# validate="many_to_one" asks pandas to check that each school_name appears only once
# in school_data_df and to raise a MergeError otherwise, instead of silently duplicating student rows.
school_data_complete_df = pd.merge(student_data_df, school_data_df, on="school_name", validate="many_to_one")
school_data_complete_df.head()

# Breaking down the code
    # We create a new DataFrame for the merged DataFrames, called school_data_complete_df.
    # The new DataFrame is created as a result of merging DataFrame #2 (school_data_df), 
        # which is the "right" DataFrame, into DataFrame #1 (student_data_df), 
        # which is the "left" DataFrame. 
        # We refer to the DataFrames as "left" and "right" to reflect the order they appear inside the parentheses.
    # We use the parameter "on" to name the column that is identical in each DataFrame, 
        # in this case, "school_name." 
        # A list of column names, e.g. on=["col_a", "col_b"], is only needed when merging on more than one column.
        
# There may be cases in which you want to merge on a column that has similar information in two separate DataFrames, but is named differently in each
    # for example, "school_name" in one DataFrame and "high_school" in the second. 
    # In these cases, you should rename the columns so that they match. 
    # This will help avoid duplicate columns or merging issues.

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [145]:
# GET THE NUMBER OF STUDENTS

# Get the total number of students.
    # Calling school_data_complete_df.count() on the whole DataFrame returns one non-null count
    # per column (a Series), not a single number.
    # To get a single value, we count one column using the following format:
        # school_data_complete_df[column].count()
    # To assign the "student_count" to a column that identifies with students, we will use the "Student ID" column.
student_count = school_data_complete_df["Student ID"].count()

student_count

39170

In [23]:
# GET THE NUMBER OF SCHOOLS
# Calculate the total number of schools.
school_count = school_data_df["school_name"].count()
school_count

15

In [24]:
# We can't use the count() method on the school_data_complete_df["school_name"] column because this would give us a value of 39,170. 
    # If we want to use school_data_complete_df, we first need to get the unique items in the ["school_name"] column by using the unique() method. 
    # This method will return a "ndarray", or n-dimensional array of all the unique values of that column.
    
# Calculate the total number of schools
school_count_2 = school_data_complete_df["school_name"].unique()
school_count_2

    # The output of using the unique() method on a Series is a ndarray of all the high schools
        # and you know how to get the number of items in an array
# Getting the number of high schools from the array school_count_2
    # len(school_data_complete_df["school_name"].unique())

array(['Huang High School', 'Figueroa High School', 'Shelton High School',
       'Hernandez High School', 'Griffin High School',
       'Wilson High School', 'Cabrera High School', 'Bailey High School',
       'Holden High School', 'Pena High School', 'Wright High School',
       'Rodriguez High School', 'Johnson High School', 'Ford High School',
       'Thomas High School'], dtype=object)

In [25]:
# GET THE TOTAL BUDGET
    # sum() method
    # COMPLETE DATAFRAME
        #total_budget = school_data_complete_df["budget"].sum()
        #total_budget
        #output: 82932329558
            #This number is huge
            #the code summed up all 39,170 rows in the budget column
    # Instead of using the sum() method on the "budget" column of the school_data_complete_df DataFrame, 
        # we can apply the sum() method on the "budget" column of the school_data_df DataFrame.

# Calculate the total budget.
total_budget = school_data_df["budget"].sum()
total_budget

24649428

In [26]:
# GET THE SCORE AVERAGES
    # The Pandas method for getting the average of columns is the mean() method.
    
# Calculate the average reading score.
average_reading_score = school_data_complete_df["reading_score"].mean()
average_reading_score


81.87784018381414

In [27]:
# Calculate the average math score.
average_math_score = school_data_complete_df["math_score"].mean()
average_math_score

78.98537145774827

In [None]:
# GET THE PASSING PERCENTAGES
    # To get the percentage of students who passed math and reading, we will write code to:
        # Determine the passing grade.
        # Get the number of students who passed math and reading in separate DataFrames.
        # Calculate the number of students who passed math and reading.
        # Calculate the percentage of students who passed math and reading.
    # To get the overall passing percentage, we will write code to:
        # Get the number of students who passed both math and reading in a DataFrame.
        # Calculate the number of students who passed both math and reading.
        # Calculate the percentage of students who passed both math and reading.

In [28]:
# Determine the Passing Grade
    # For math and reading assessment tests in this school district, the passing score was 70. 
        # Therefore, we need to get all the math and reading scores that are greater than or equal to 70. 
        # To do this, in a new cell, assign a passing_math variable to the math_score column in school_data_complete_df, where all the math scores are equal to or greater than 70.

passing_math = school_data_complete_df["math_score"] >= 70
passing_reading = school_data_complete_df["reading_score"] >= 70

In [29]:
    # To find the passing_math variable, run passing_math in a new cell. 
        # The result is Boolean values for the rows, 
            # where "True" means the score is equal to or greater than 70, 
            # and "False" means the score is not equal to or greater than 70.
passing_math

0         True
1        False
2        False
3        False
4         True
         ...  
39165     True
39166     True
39167     True
39168     True
39169     True
Name: math_score, Length: 39170, dtype: bool

In [30]:
passing_reading

0        False
1         True
2         True
3        False
4         True
         ...  
39165     True
39166     True
39167     True
39168     True
39169     True
Name: reading_score, Length: 39170, dtype: bool

In [85]:
# GET THE NUMBER OF STUDENTS WHO PASSED MATH AND READING
    # To get all the students who passed math and all the students who passed reading, 
        # we need to filter our school_data_complete_df DataFrame for the "True" cases. 
        # In other words, get only the students whose score is greater than or equal to 70.
    # We can filter the school_data_complete_df DataFrame by adding the school_data_complete_df["math_score"] >= 70 within brackets
# Get all the students who are passing math in a new DataFrame.
passing_math = school_data_complete_df[school_data_complete_df["math_score"] >= 70]
passing_math.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635
8,8,Michael Roth,M,10th,Huang High School,95,87,0,District,2917,1910635


In [94]:
# Get all the students that are passing reading in a new DataFrame.
passing_reading = school_data_complete_df[school_data_complete_df["reading_score"] >= 70]

In [95]:
# Calculate the number of students passing math.
passing_math_count = passing_math["student_name"].count()

# Calculate the number of students passing reading.
passing_reading_count = passing_reading["student_name"].count()

In [96]:
print(passing_math_count)
print(passing_reading_count)

29370
33610


In [138]:
# Display student_count as a float. The result is not assigned to anything,
# so student_count itself is left unchanged by this cell.
float(student_count)

39170.0

In [139]:
# Calculate the percent that passed math.
passing_math_percentage = (passing_math_count / student_count) * 100

# Calculate the percent that passed reading.
passing_reading_percentage = (passing_reading_count / student_count) * 100

print(passing_math_percentage)
print(passing_reading_percentage)

74.9808526933878
85.80546336482001


In [121]:
# Calculate the students who passed both math and reading.
    # We can filter the school_data_complete_df DataFrame by 
        # adding the school_data_complete_df["math_score"] >= 70 and school_data_complete_df["reading_score"] >= 70 with the logical operator "&" within brackets

passing_math_reading = school_data_complete_df[(school_data_complete_df["math_score"] >= 70) & (school_data_complete_df["reading_score"] >= 70)]

passing_math_reading.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635
8,8,Michael Roth,M,10th,Huang High School,95,87,0,District,2917,1910635
9,9,Matthew Greene,M,10th,Huang High School,96,84,0,District,2917,1910635


In [122]:
# Calculate the number of students who passed both math and reading.
overall_passing_math_reading_count = passing_math_reading["student_name"].count()
overall_passing_math_reading_count

25528

In [141]:
# Calculate the overall passing percentage.
overall_passing_percentage = overall_passing_math_reading_count / student_count * 100
overall_passing_percentage

65.17232575950983

In [146]:
#CREATE A DISTRICT SUMMARY DATAFRAME
    # one way to create a new DataFrame is to convert a list of dictionaries to a DataFrame.
    # To create district_summary_df DataFrame, we can create a list of dictionaries
        # where the keys are column names and the values are the metrics we calculated. 
        # We do this because the DataFrame has no index, and lists have natural indexing.
        
# Collect the district-wide metrics in one dictionary (the keys become the
# column names), then wrap it in a list to build a single-row DataFrame.
district_metrics = {
    "Total Schools": school_count,
    "Total Students": student_count,
    "Total Budget": total_budget,
    "Average Math Score": average_math_score,
    "Average Reading Score": average_reading_score,
    "% Passing Math": passing_math_percentage,
    "% Passing Reading": passing_reading_percentage,
    "% Overall Passing": overall_passing_percentage,
}
district_summary_df = pd.DataFrame([district_metrics])
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,65.172326


In [None]:
# FORMAT COLUMNS
    # To clean up the district_summary_df DataFrame, we will 
        # format dollar amounts to two decimal places
        # format the grade averages to one decimal place and percentages to the nearest whole number percent
        
    # This type of formatting can be done with the built-in Pandas map() function. 
    # The map() function is used for substituting each value in a Series with another value. 
        # Where the new value is generated from a function, a dictionary, or a Series.

In [147]:
# WRITING FUNCTIONS FOR THE SCHOOL DISTRICT DATA
    # Let's say you need to write a function to get 
        # the percentage of students who passed math when we know the number of students that passed math, pass_math_count
        # and the total number of students for a school district, student_count. 
    # We can define this function as passing_math_percent and pass the two values to it as parameters. 
    # Here's what this might look like:
    
# Define a function that calculates the percentage of students that passed
# math and returns that percentage when the function is called.

def passing_math_percent(pass_math_count, student_count):
    """Return the share of students who passed math, as a percentage.

    pass_math_count: number of students who passed math.
    student_count: total number of students; converted to float before dividing.
    """
    passing_fraction = pass_math_count / float(student_count)
    return passing_fraction * 100

#Let's go over what is happening in this function.
    # We gave the passing_math_percent function two parameters: pass_math_count and student_count.
    # We added return in front of the calculation for the passing percentage.
    # The return statement has a unique purpose. 
        # It causes the function to end and sends the value of the expression that follows "return" back to the caller, i.e., the code that called the function. Let's run through an example of what happens to better illustrate the process.


In [148]:
# Assign the passing_math_count and the total_student_count variables 29,370 and 39,170, respectively. 
    # When we run this cell, nothing will happen until we call the function.
    
passing_math_count = 29370
total_student_count = 39170

In [149]:
# Call the function.
passing_math_percent(passing_math_count, total_student_count)

# If you call the function without supplying a value for every parameter it defines, you'll get a TypeError. 
# This means that you need to pass the missing argument(s) when calling the function.

74.9808526933878

In [150]:
# Format the "Total Students" to have the comma for a thousands separator.
# NOTE: this replaces the numeric column with strings, so running this cell a
# second time raises a ValueError (the "," format option is not valid for str).
district_summary_df["Total Students"] = district_summary_df["Total Students"].map("{:,}".format)
district_summary_df["Total Students"]

# One benefit of using the format() function is that we can add other format specifications, like a U.S. dollar sign or other characters. 
    # For example, we will format the Total Budget column in the district_summary_df DataFrame with a U.S. dollar sign
    # then format the numbers with a thousands separator and to two decimal places using the following syntax: "${:,.2f}".format.

0    39,170
Name: Total Students, dtype: object

In [151]:
# Format "Total Budget" to have the comma for a thousands separator, a decimal separator, and a "$".
# NOTE: the column holds strings afterwards, so running this cell a second time
# raises a ValueError (the "f" format code is not valid for str).
district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map("${:,.2f}".format)
district_summary_df["Total Budget"]

0    $24,649,428.00
Name: Total Budget, dtype: object

In [168]:
district_summary_df.dtypes

Total Schools             int64
Total Students           object
Total Budget             object
Average Math Score       object
Average Reading Score    object
% Passing Math           object
% Passing Reading        object
% Overall Passing        object
dtype: object

In [166]:
# Convert the score and percentage columns to float in the pandas DataFrame.
# NOTE(review): on a fresh top-to-bottom run these five columns are already
# float64 at this point, so the conversion is a no-op there; it only matters
# when the columns hold strings (dtype object) — confirm the cell is still needed.
float_columns = [
    "Average Math Score",
    "Average Reading Score",
    "% Passing Math",
    "% Passing Reading",
    "% Overall Passing",
]
for column in float_columns:
    district_summary_df[column] = district_summary_df[column].astype(float)

In [167]:
# Format the columns: score averages to one decimal place, percentages to the
# nearest whole number.
# Each column is coerced to float before formatting so that this cell can be
# re-run after the values have already been turned into strings; applying a
# float format such as "{:.1f}" directly to a str raises a ValueError.
column_formats = {
    "Average Math Score": "{:.1f}",
    "Average Reading Score": "{:.1f}",
    "% Passing Math": "{:.0f}",
    "% Passing Reading": "{:.0f}",
    "% Overall Passing": "{:.0f}",
}
for column, number_format in column_formats.items():
    district_summary_df[column] = district_summary_df[column].astype(float).map(number_format.format)

In [170]:
#REORDERING THE COLUMNS
    # To reorder columns using Pandas, we can pass a list of columns to a current DataFrame using square bracket notation. 
    # This tells Pandas to select those specific columns and put them in the DataFrame in the same order that they appear in the list.

# Reorder the columns in the order you want them to appear.
new_column_order = ["Total Schools", "Total Students", "Total Budget","Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading", "% Overall Passing"]

# Assign district summary df the new column order.
district_summary_df = district_summary_df[new_column_order]

In [169]:
# Display the DataFrame
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,65
