# Authored by Nathan Cantwell (nycantwe@umich.edu).

# **Imports, optional Google Drive mount (Colab only), and reading in the data files.**
Contains:
* Imported libraries
* Datasets (Parquet format) and source URLs.
---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import os
import altair as alt
# # Remove these when converting away from G Collab
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# County-level home price aggregates. The Parquet file is produced by the
# companion notebook "housing_market_data_cleaning.ipynb".
HOUSING_MARKET_PARQUET = 'data/home_price_agg_by_county.parquet.gzip'
df_housing_market = pd.read_parquet(HOUSING_MARKET_PARQUET)
df_housing_market.head()

Unnamed: 0,FIPS_CODE,REGION,PROPERTY_TYPE,PERIOD_BEGIN,INVENTORY,HOMES_SOLD,MEDIAN_SALE_PRICE_interpolated,MEDIAN_SALE_PRICE_interpolated_inflation_adj_2012-01,MEDIAN_LIST_PRICE_interpolated,MEDIAN_LIST_PRICE_interpolated_inflation_adj_2012-01,...,MEDIAN_SALE_PRICE_no_outliers,MEDIAN_SALE_PRICE_no_outliers_inflation_adj_2012-01,MEDIAN_LIST_PRICE_no_outliers,MEDIAN_LIST_PRICE_no_outliers_inflation_adj_2012-01,CPI,original_record,MEDIAN_SALE_PRICE_is_interpolated,MEDIAN_LIST_PRICE_is_interpolated,MEDIAN_SALE_PRICE_is_outlier,MEDIAN_LIST_PRICE_is_outlier
229742,28137,"Tate County, MS",All Residential,2023-11-01,53,8,284000,384024,264900,358197,...,284000.0,384024.0,264900.0,358197.0,308.087,True,False,False,False,False
473092,54045,"Logan County, WV",All Residential,2023-01-01,46,9,72500,95606,70000,92309,...,72500.0,95606.0,70000.0,92309.0,300.456,True,False,False,False,False
315697,38053,"McKenzie County, ND",All Residential,2013-06-01,23,0,250000,255051,156600,159764,...,,,,,232.445,False,True,True,False,False
119154,18131,"Pulaski County, IN",All Residential,2021-07-01,33,17,145000,173080,174900,208770,...,145000.0,173080.0,174900.0,208770.0,271.965,True,False,False,False,False
382279,47025,"Claiborne County, TN",All Residential,2015-04-01,184,19,118500,122858,164900,170965,...,118500.0,122858.0,164900.0,170965.0,236.222,True,False,False,False,False


In [3]:
# Keep only the columns the line plots need, under shorter names.
# Everything is done in one chain that ends in `.assign`, so the result is a
# fresh frame; the previous version sliced the frame and then wrote a column
# onto that slice, which is the pattern behind pandas' SettingWithCopyWarning
# (silenced in this notebook by warnings.filterwarnings('ignore')).
df_housing_market = (
    df_housing_market
    .rename(columns={
        'PERIOD_BEGIN': 'DATE',
        'MEDIAN_SALE_PRICE_interpolated_inflation_adj_2012-01': 'SALE_PRICE',
        'FIPS_CODE': 'FIPS',
    })
    .loc[:, ['FIPS', 'REGION', 'DATE', 'SALE_PRICE']]
    # DATE is kept as text from here on: later cells match it with
    # .str.endswith(...) and compare it against string literals.
    .assign(DATE=lambda frame: frame['DATE'].astype('str'))
)
df_housing_market.head()

Unnamed: 0,FIPS,REGION,DATE,SALE_PRICE
229742,28137,"Tate County, MS",2023-11-01,384024
473092,54045,"Logan County, WV",2023-01-01,95606
315697,38053,"McKenzie County, ND",2013-06-01,255051
119154,18131,"Pulaski County, IN",2021-07-01,173080
382279,47025,"Claiborne County, TN",2015-04-01,122858


# **Building the Line Plot.**
Contains:
* Creating a line plot for national average price over time.
* Adding an annotation for the onset of the COVID-19 pandemic.
* Adding in lines for average price per state over time.
---

In [4]:
# National average = mean SALE_PRICE over every row (one per FIPS) that shares
# a DATE. `as_index=False` keeps DATE as an ordinary column.
df_nat_avg = (
    df_housing_market
    .groupby('DATE', as_index=False)['SALE_PRICE']
    .mean()
)
df_nat_avg.head()

Unnamed: 0,DATE,SALE_PRICE
0,2012-01-01,141132.538462
1,2012-02-01,103580.671642
2,2012-03-01,124409.128571
3,2012-04-01,127868.213115
4,2012-05-01,110753.55


In [5]:
# X-axis ticks: January 1st of each even year present in the data.
# Filtering on the year's parity (rather than taking every other January)
# keeps the ticks on even years even if the series starts on an odd year or
# a January is missing.
january_dates = df_nat_avg.loc[
    df_nat_avg['DATE'].str.endswith('01-01'), 'DATE'
].unique()
xticks = [date for date in january_dates if int(date[:4]) % 2 == 0]

# Vega expression for the tick labels: show only the four-character year
# prefix of the 'YYYY-MM-DD' label. This works for any year, whereas a
# hard-coded list of years would fall through to 'Other' once the data
# extends past the last listed year.
date_axis_labels = "slice(datum.label, 0, 4)"

# Line chart for the national average sale price.
line_nat_avg = (
    alt.Chart(df_nat_avg)
    .mark_line(color='black')
    .encode(
        x=alt.X(
            'DATE',
            axis=alt.Axis(
                values=xticks,
                labelExpr=date_axis_labels,
                labelAngle=0,
                title='Year',
            ),
        ),
        y=alt.Y('SALE_PRICE', title='Home Sale Price ($)'),
    )
    .properties(
        width=600,
        height=800,
        title='National Average Home Sale Price (Inflation Adjusted, 2012)',
    )
)

line_nat_avg

In [6]:
# Annotation for the beginning of the COVID lockdowns: one vertical rule plus
# three lines of text.
# The anchor is the single 2020-03-01 row of the national average frame.
# `.assign` returns a new frame, so the text columns are not written onto a
# filtered slice of df_nat_avg.
df_covid_point = df_nat_avg[df_nat_avg['DATE'] == '2020-03-01'].assign(
    text1='March 15, 2020, states begin',
    text2='to implement shutdowns in order to',
    text3='prevent the spread of COVID-19.',
)

# One blue ('#1f77b4') vertical rule at that DATE.
covid_line = (
    alt.Chart(df_covid_point)
    .mark_rule(color='#1f77b4', opacity=1)
    .encode(x=alt.X('DATE'))
)

# The annotation is split over three text marks so it fits inside the chart
# boundaries. Each mark reads one text column and is shifted by its own
# vertical offset (dy); everything else is shared.
covid_text_offsets = [('text1', -210), ('text2', -197), ('text3', -184)]
covid_text1, covid_text2, covid_text3 = (
    covid_line.mark_text(
        align='right',
        baseline='top',
        dx=-10,
        dy=offset,
    ).encode(text=f'{text_col}:N')
    for text_col, offset in covid_text_offsets
)

# Chart for national average line, plus COVID line and text.
covid_marks = covid_line + covid_text1 + covid_text2 + covid_text3
line_nat_avg + covid_marks

**Next, we add a line for each state's rolling average sale price over time. A rolling average is used because some states have very high, short-lived spikes that would dramatically stretch the y-axis scale and make the chart less effective:**
* Calculate rolling average sale price, at the state level.
* Generate a line plot for each state, layer over the national average chart.

In [7]:
# State abbreviation = last two characters of REGION
# (e.g. "Tate County, MS" -> "MS").
# The vectorised `.str` slice replaces a Python-level loop, and `.assign`
# returns a new frame instead of writing a column onto the sliced frame.
df_housing_market = df_housing_market.assign(
    STATE=df_housing_market['REGION'].str[-2:]
)

# Mean sale price per (DATE, STATE). The result is ordered by DATE then STATE,
# so the rows of each state are in chronological order for the rolling window.
df_state_avgs = (
    df_housing_market
    .groupby(['DATE', 'STATE'], as_index=False)['SALE_PRICE']
    .mean()
)

# Rolling mean over the current row and up to three preceding rows of the same
# state. min_periods=1 means the first rows of a state are averaged over
# however many rows exist so far instead of becoming NaN.
# NOTE(review): the window counts rows, not calendar months -- if a state is
# missing a DATE the window spans the gap; confirm that is acceptable.
ROLLING_WINDOW = 4
df_state_avgs['ROLLING_AVG'] = (
    df_state_avgs
    .groupby('STATE')['SALE_PRICE']
    .transform(
        lambda prices: prices.rolling(window=ROLLING_WINDOW, min_periods=1).mean()
    )
)

df_state_avgs.head()

Unnamed: 0,DATE,STATE,SALE_PRICE,ROLLING_AVG
0,2012-01-01,AK,270000.0,270000.0
1,2012-01-01,AL,100500.0,100500.0
2,2012-01-01,AR,120000.0,120000.0
3,2012-01-01,CO,162950.0,162950.0
4,2012-01-01,DC,353000.0,353000.0


In [8]:
# Define helper function to generate a line chart for one state.
def get_one_line_one_state(df, state, col, xticks, yticks, ydomain,
                           label_expr=None):
  """Build a semi-transparent orange line chart for a single state.

  Parameters
  ----------
  df : DataFrame with at least the columns 'STATE', 'DATE' and `col`.
  state : value matched against df['STATE'] to select the rows to plot.
  col : name of the column encoded on the y-axis.
  xticks : tick values for the x-axis.
  yticks : tick values for the y-axis.
  ydomain : domain passed to the y-axis scale.
  label_expr : optional label expression string for the x-axis tick labels.
      When omitted, the notebook-level `date_axis_labels` is used, which is
      what this helper always read before the parameter existed.

  Returns
  -------
  An Altair chart (600 x 800) with one line for `state`.
  """
  # Fall back to the shared notebook expression so existing calls are unchanged.
  if label_expr is None:
    label_expr = date_axis_labels

  df_one_state = df[df['STATE'] == state]

  x_encoding = alt.X(
      'DATE',
      axis=alt.Axis(values=xticks, labelExpr=label_expr),
  )
  y_encoding = alt.Y(
      col,
      axis=alt.Axis(values=yticks),
      scale=alt.Scale(domain=ydomain),
  )

  state_line_chart = (
      alt.Chart(df_one_state)
      .mark_line(
          opacity=0.25,  # low opacity keeps overlapping lines readable (spaghetti plot)
          color='#d95f02',
      )
      .encode(x=x_encoding, y=y_encoding)
      .properties(width=600, height=800)
  )
  return state_line_chart

In [9]:
# Y-axis: 14 evenly spaced ticks from 100,000 to 1,400,000 on a fixed
# 0 - 1,400,000 scale.
yticks = np.linspace(100000, 1400000, 14)
ydomain = [0,1400000]

# One rolling-average line chart per state, built with the helper above.
all_states = df_state_avgs['STATE'].unique()
shared_chart_kwargs = dict(
    col='ROLLING_AVG',
    xticks=xticks,
    yticks=yticks,
    ydomain=ydomain,
)
state_line_charts = [
    get_one_line_one_state(df_state_avgs, state_abbr, **shared_chart_kwargs)
    for state_abbr in all_states
]

# Replace the national chart's title with a two-line caption placed below the
# plot. Note that this rebinds `line_nat_avg`; re-run the cell that builds it
# to get the chart with its original title back.
caption_lines = [
    'National average sale price (in black) rises from $120,000 in 2012, to $200,000 in 2020. By 2025 this has reached nearly $400,000.',
    'In descending order three states (MA, HI, CA) and DC appear far higher than the main group, state lines in orange.',
]
caption = alt.TitleParams(
    caption_lines,
    baseline='bottom',
    orient='bottom',
    anchor='middle',
    fontWeight='normal',
    fontSize=10,
    dy=20,
    dx=20,
)
line_nat_avg = line_nat_avg.properties(title=caption)

# The headline is drawn as a text mark and stacked above the plot via vconcat.
title_data = {'values': [{
    'text':
    'State and National Average Home Sale Price (Inflation Adjusted, 2012)',
}]}
title = (
    alt.Chart(title_data)
    .mark_text(size=19, align='left')
    .encode(text='text:N')
)

# Layer order: state lines first, then the national average, then the COVID
# annotation; finally stack the headline on top.
layered_lines = alt.layer(*state_line_charts, line_nat_avg, covid_marks)
(
    alt.vconcat(title, layered_lines)
    .configure_view(stroke=None)
    .configure_concat(spacing=1)
)