In [15]:
import pandas as pd

In [16]:
# Read the data, skipping the first 6 rows of hpi_zip5
hpi_zip5 = pd.read_excel('data/hpi_zip5.xlsx', skiprows=6)
zipcodesaffected = pd.read_csv('data/zip-code-csv.txt')

In [17]:
print(zipcodesaffected.columns)
print(hpi_zip5.columns)

Index(['Zip Code'], dtype='object')
Index(['Five-Digit ZIP Code', 'Year', 'Annual Change (%)', 'HPI',
       'HPI with 1990 base', 'HPI with 2000 base'],
      dtype='object')


In [22]:
# Filter hpi_zip5 for years 2017 to 2023
hpi_zip5_filtered = hpi_zip5[(hpi_zip5['Year'] >= 2017) & (hpi_zip5['Year'] <= 2023)]

# Filter hpi_zip5 for zip codes in zipcodesaffected
hpi_zip5_filtered = hpi_zip5_filtered[hpi_zip5_filtered['Five-Digit ZIP Code'].isin(zipcodesaffected['Zip Code'])]

# Perform a left join to include all zip codes from zipcodesaffected
merged_data = pd.merge(zipcodesaffected, hpi_zip5_filtered, left_on='Zip Code', right_on='Five-Digit ZIP Code', how='outer')

# Remove the 'Five-Digit ZIP Code' column
merged_data = merged_data.drop(columns=['Five-Digit ZIP Code'])

# **Convert 'Year' and 'HPI' columns in merged_data to numeric**
merged_data['Year'] = pd.to_numeric(merged_data['Year'], errors='coerce').astype('Int64')
merged_data['HPI'] = pd.to_numeric(merged_data['HPI'], errors='coerce')
merged_data['HPI with 2000 base'] = pd.to_numeric(merged_data['HPI'], errors='coerce')
merged_data['Annual Change (%)'] = pd.to_numeric(merged_data['Annual Change (%)'], errors='coerce')

# Save the merged data to a new CSV file
merged_data.to_csv('data/HPI_ZIP_year_affected.csv', index=False)

In [12]:
print(merged_data.columns)

Index(['Five-Digit ZIP Code', 'Year', 'Annual Change (%)', 'HPI',
       'HPI with 1990 base', 'HPI with 2000 base', 'Zip Codes'],
      dtype='object')


In [13]:
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# Plotting the data
zip_codes = merged_data['Five-Digit ZIP Code'].unique()

for zip_code in zip_codes:
    zip_data = merged_data[merged_data['Five-Digit ZIP Code'] == zip_code]
    zip_data = zip_data.sort_values(by='Year')

    # Get the 'affected' status for this zip code
    affected_status = zip_data['Affected'].iloc[0]

    # Choose color based on 'affected' status
    if affected_status == 'Y':
        line_color = 'red'
    else:
        line_color = 'blue'

    plt.figure(figsize=(10, 6))
    plt.plot(zip_data['Year'], zip_data['HPI'], marker='o', linestyle='-', color=line_color)
    plt.title(f"HPI Trends for Zip Code {zip_code} (Affected: {affected_status})")
    plt.xlabel('Year')
    plt.ylabel('House Price Index (HPI)')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'data/HPI_Trends_Zip_Code/HPI_Trends_Zip_Code_{zip_code}.png')
    plt.close()

    plt.figure(figsize=(10, 6))
    plt.plot(zip_data['Year'], zip_data['Annual Change (%)'], marker='o', linestyle='-', color=line_color)
    plt.title(f"HPI Trends for Zip Code {zip_code} (Affected: {affected_status})")
    plt.xlabel('Year')
    plt.ylabel('HPI Growth (% Y/Y)')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'data/HPI_Growth_Zip_Code/HPI_Growth_Zip_Code_{zip_code}.png')
    plt.close()

KeyError: 'Affected'