In [None]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sts
from scipy.stats import linregress

In [None]:
# Relative path of the merged school / house-value CSV
merged_dataset_to_load = "data/merged_school_zhi_value.csv"

# Load the CSV into a dataframe and discard the "Unnamed: 0" column in one step
# (presumably a row index written out when the file was saved -- TODO confirm)
merged_data = pd.read_csv(merged_dataset_to_load).drop(columns="Unnamed: 0")
merged_data.head()

In [None]:
# Column labels of the twelve monthly house values for 2019 ("2019-01" ... "2019-12")
monthly_house_values = [f"2019-{month:02d}" for month in range(1, 13)]

# Row-wise mean and median across those twelve monthly columns
values_2019 = merged_data[monthly_house_values]
merged_data["Average House Value 2019"] = values_2019.mean(axis=1)
merged_data["Median House Value 2019"] = values_2019.median(axis=1)
merged_data.head()

In [None]:
# Group the rows by zip code and keep handles on the two columns that are
# aggregated more than once
merged_data_group = merged_data.groupby(["Zip Code"])
december_values = merged_data_group["2019-12"]
science_scores = merged_data_group["Science Achievement"]

# 2019 yearly house value figures: the first non-null value found in each zip code group
average_house_value_2019 = merged_data_group["Average House Value 2019"].first()
median_house_value_2019 = merged_data_group["Median House Value 2019"].first()

# December 2019 house values: mean and median over the rows of each zip code
average_house_value_dec2019 = december_values.mean()
median_house_value_dec2019 = december_values.median()

# Science achievement scores: mean and median over the rows of each zip code
average_science_achievement_score = science_scores.mean()
median_science_achievement_score = science_scores.median()

# Collect the six per-zip-code series into one summary dataframe (indexed by zip code)
summary_columns = {
    "Average House Value 2019": average_house_value_2019,
    "Median House Value 2019": median_house_value_2019,
    "Dec 2019 Average House Value": average_house_value_dec2019,
    "Dec 2019 Median House Value": median_house_value_dec2019,
    "Average Science Achievement Score": average_science_achievement_score,
    "Median Science Achievement Score": median_science_achievement_score,
}
house_value_science_score_summary = pd.DataFrame(summary_columns)

house_value_science_score_summary

In [None]:
# Build a display-only, string-formatted version of the summary dataframe.
# The numeric summary itself is left untouched so it can still be used for analysis.
#
# The formatted frame is assembled from freshly built string columns instead of
# writing strings into the float columns of a copy through .iloc: that in-place
# assignment of an incompatible dtype is deprecated in recent pandas releases,
# and DataFrame.applymap is deprecated as well. Series.map behaves the same on
# old and new pandas versions.
currency_columns = house_value_science_score_summary.iloc[:, :4]  # the four house value columns
score_columns = house_value_science_score_summary.iloc[:, 4:]     # the science score columns

house_value_science_score_summary_formatted = pd.concat(
    [
        # House values as dollars with thousands separators and two decimals
        currency_columns.apply(lambda column: column.map("${:,.2f}".format)),
        # Science scores with two decimals
        score_columns.apply(lambda column: column.map("{:.2f}".format)),
    ],
    axis=1,
)
house_value_science_score_summary_formatted



In [None]:
# Cleanse dataframe: remove rows with missing data fields.
# dropna() returns a new dataframe holding only the zip codes for which every
# summary column has a value; the original summary dataframe is not modified.
# The regression / scatter plot cells read from this scrubbed dataframe.
house_value_science_score_summary_scrubbed = house_value_science_score_summary.dropna()
house_value_science_score_summary_scrubbed

In [None]:
# Scatter plot of the Average Science Achievement Score vs. Average House Value in 2019, one point per zip code
# The fitted linear regression line is drawn over the scatter
# The regression equation, r value and r-squared value are printed below the plot
x_values = house_value_science_score_summary_scrubbed["Average Science Achievement Score"]
y_values = house_value_science_score_summary_scrubbed["Average House Value 2019"]

# Least-squares fit of house value (y) on science score (x)
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = x_values.mul(slope).add(intercept)
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

fig, ax = plt.subplots()
# Marker size (s) is taken from the science score itself
ax.scatter(x_values, y_values, s=x_values, marker="o", facecolors="red", edgecolors="black", alpha=0.75)
ax.plot(x_values, regress_values, "b-")
ax.set_title("House Values vs Science Achievement Scores")
ax.set_xlabel("Average Science Score (Per Zip Code)")
ax.set_ylabel("Average House Value in 2019 (Per Zip Code)")
plt.show()

print(f"The Linear Regression Equation is {line_eq}")
print(f"The Coefficient of Correlation is: {rvalue}")
print(f"The Coefficient of Determination is: {rvalue**2}")

In [None]:
# Scatter plot of the Average Science Achievement Score vs. Average House Value in December 2019, one point per zip code
# The fitted linear regression line is drawn over the scatter
# The regression equation, r value and r-squared value are printed below the plot
x_values = house_value_science_score_summary_scrubbed["Average Science Achievement Score"]
y_values = house_value_science_score_summary_scrubbed["Dec 2019 Average House Value"]

# Least-squares fit of house value (y) on science score (x)
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = x_values.mul(slope).add(intercept)
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

fig, ax = plt.subplots()
# Marker size (s) is taken from the science score itself
ax.scatter(x_values, y_values, s=x_values, marker="o", facecolors="blue", edgecolors="black", alpha=0.75)
ax.plot(x_values, regress_values, "r-")
ax.set_title("House Values vs Science Achievement Scores")
ax.set_xlabel("Average Science Score (Per Zip Code)")
ax.set_ylabel("Average House Value in Dec 2019 (Per Zip Code)")
plt.show()

print(f"The Linear Regression Equation is {line_eq}")
print(f"The Coefficient of Correlation is: {rvalue}")
print(f"The Coefficient of Determination is: {rvalue**2}")

In [None]:
# Scatter plot of the Median Science Achievement Score vs. Median House Value in 2019, one point per zip code
# The fitted linear regression line is drawn over the scatter
# The regression equation, r value and r-squared value are printed below the plot
x_values = house_value_science_score_summary_scrubbed["Median Science Achievement Score"]
y_values = house_value_science_score_summary_scrubbed["Median House Value 2019"]

# Least-squares fit of house value (y) on science score (x)
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = x_values.mul(slope).add(intercept)
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

fig, ax = plt.subplots()
# Marker size (s) is taken from the science score itself
ax.scatter(x_values, y_values, s=x_values, marker="o", facecolors="green", edgecolors="black", alpha=0.75)
ax.plot(x_values, regress_values, "r-")
ax.set_title("House Values vs Science Achievement Scores")
ax.set_xlabel("Median Science Score (Per Zip Code)")
ax.set_ylabel("Median House Value in 2019 (Per Zip Code)")
plt.show()

print(f"The Linear Regression Equation is {line_eq}")
print(f"The Coefficient of Correlation is: {rvalue}")
print(f"The Coefficient of Determination is: {rvalue**2}")

In [None]:
# Scatter plot of the Median Science Achievement Score vs. Median House Value in December 2019, one point per zip code
# The fitted linear regression line is drawn over the scatter
# The regression equation, r value and r-squared value are printed below the plot
x_values = house_value_science_score_summary_scrubbed["Median Science Achievement Score"]
y_values = house_value_science_score_summary_scrubbed["Dec 2019 Median House Value"]

# Least-squares fit of house value (y) on science score (x)
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = x_values.mul(slope).add(intercept)
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

fig, ax = plt.subplots()
# Marker size (s) is taken from the science score itself
ax.scatter(x_values, y_values, s=x_values, marker="o", facecolors="purple", edgecolors="black", alpha=0.75)
ax.plot(x_values, regress_values, "r-")
ax.set_title("House Values vs Science Achievement Scores")
ax.set_xlabel("Median Science Score (Per Zip Code)")
ax.set_ylabel("Median House Value in Dec 2019 (Per Zip Code)")
plt.show()

print(f"The Linear Regression Equation is {line_eq}")
print(f"The Coefficient of Correlation is: {rvalue}")
print(f"The Coefficient of Determination is: {rvalue**2}")