In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
# Render floats with two decimal places. The fully qualified key is used
# because the short form 'precision' is resolved by pattern matching and
# becomes ambiguous (display.precision vs. styler.format.precision) on newer
# pandas releases, which raises OptionError.
pd.set_option('display.precision', 2)

# Input CSV locations (adjust these paths if the Resources folder moves).
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Both score columns are cast to float right after loading.
score_dtypes = {"reading_score": float, "math_score": float}

# Load the school table as-is.
school_data_df = pd.read_csv(school_data_to_load)

# Load the student table, then apply the float cast to the score columns.
student_data_df = pd.read_csv(student_data_to_load)
student_data_df = student_data_df.astype(score_dtypes)

# Clean student names by stripping professional titles.
# Each entry is a literal prefix (with trailing space) or suffix (with
# leading space) to remove from the "student_name" column.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

# Replace every occurrence of each title with an empty string, "".
# regex=False is passed explicitly: the titles contain ".", which would act
# as a "match any character" wildcard if the pattern were interpreted as a
# regular expression (the default on pandas < 2.0). Forcing literal matching
# gives the same result on every pandas version.
# NOTE(review): the replacement is not anchored, so a title occurring in the
# middle of a name would also be removed - confirm this is acceptable.
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(word, "", regex=False)

# Work on a copy so the unmodified scores in student_data_df stay available.
redacted_data_df = student_data_df.copy()

# Select the 9th-grade rows of Thomas High School once, then blank out both
# score columns for those rows in a single assignment.
thomas_ninth_mask = (
    redacted_data_df["grade"].eq("9th")
    & redacted_data_df["school_name"].eq("Thomas High School")
)
redacted_data_df.loc[thomas_ninth_mask, ["reading_score", "math_score"]] = np.nan

In [2]:
def mean_score_by_grade(students_df, score_column):
    """Return the mean of ``score_column`` for each grade.

    The column is selected *before* aggregating, so only that column is
    averaged. Calling ``.mean()`` on the whole grouped frame would also try
    to average the text columns, which raises ``TypeError`` on pandas >= 2.0
    and is wasted work on older versions.

    Parameters
    ----------
    students_df : pandas.DataFrame
        Frame containing a "grade" column and ``score_column``.
    score_column : str
        Name of the score column to average.

    Returns
    -------
    pandas.Series
        Mean score indexed by grade; NaN scores are skipped by ``mean``.
    """
    return students_df.groupby("grade")[score_column].mean()


# Reading: original mean, mean after redaction, and the difference per grade.
orig_read_mean = mean_score_by_grade(student_data_df, "reading_score")
redacted_read_mean = mean_score_by_grade(redacted_data_df, "reading_score")
diff_read_mean = redacted_read_mean - orig_read_mean
read_mean_df = pd.DataFrame({
    'Initial Read Mean': orig_read_mean,
    'Final Read Mean': redacted_read_mean,
    'Read Mean Diff': diff_read_mean,
})

# Math: original mean, mean after redaction, and the difference per grade.
orig_math_mean = mean_score_by_grade(student_data_df, "math_score")
redacted_math_mean = mean_score_by_grade(redacted_data_df, "math_score")
diff_math_mean = redacted_math_mean - orig_math_mean
math_mean_df = pd.DataFrame({
    'Initial Math Mean': orig_math_mean,
    'Final Math Mean': redacted_math_mean,
    'Math Mean Diff': diff_math_mean,
})

# Combine the math and reading summaries on their shared grade index.
test_scores_df = pd.merge(math_mean_df, read_mean_df, left_index=True, right_index=True)
# Grades sort alphabetically by default ("10th" before "9th"); put them in
# school order instead.
test_scores_df = test_scores_df.reindex(index=['9th', '10th', '11th', '12th'])

In [3]:
# Display the per-grade table of initial means, final means, and their
# differences for the math and reading scores.
test_scores_df

Unnamed: 0_level_0,Initial Math Mean,Final Math Mean,Math Mean Diff,Initial Read Mean,Final Read Mean,Read Mean Diff
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9th,78.94,78.74,-0.2,81.91,81.84,-0.08
10th,78.94,78.94,0.0,81.87,81.87,0.0
11th,79.08,79.08,0.0,81.89,81.89,0.0
12th,78.99,78.99,0.0,81.82,81.82,0.0
