In [2]:
import pandas as pd
from pathlib import Path
import numpy as np

In [3]:
# Load the raw metro median sale price table from the raw_data folder
sale_price_csv = Path("../raw_data/Metro_median_sale_price.csv")
df_sale_price = pd.read_csv(sale_price_csv)

# Preview the first five rows
df_sale_price.head()

Unnamed: 0,Date,"New York, NY","Los Angeles, CA","Chicago, IL","Houston, TX","Miami, FL","Boston, MA","Phoenix, AZ","Seattle, WA"
0,04/30/08,401195.0,499583,239167,143000,244333,322067,226300,333833
1,05/31/08,399667.0,495250,239500,145167,244000,333067,226000,334483
2,06/30/08,401667.0,490833,243667,148500,245667,344167,226000,336150
3,07/31/08,407333.0,480833,248333,152167,247333,351000,221967,334817
4,08/31/08,414000.0,467500,251667,153500,250000,353333,216633,332483


In [4]:
# Count the missing values in each column
df_sale_price.isna().sum()

Date               0
New York, NY       3
Los Angeles, CA    0
Chicago, IL        0
Houston, TX        0
Miami, FL          0
Boston, MA         0
Phoenix, AZ        0
Seattle, WA        0
dtype: int64

In [5]:
# Replace blank strings with NaN so they are treated as missing values
df_sale_price = df_sale_price.replace('', np.nan)

# Fill missing values by linear interpolation, restricted to the numeric
# columns. "Date" still holds unparsed date strings at this point, and
# interpolating non-numeric (object) columns is deprecated in recent pandas,
# so only the price columns are passed to interpolate().
numeric_cols = df_sale_price.select_dtypes(include="number").columns
df_sale_price[numeric_cols] = df_sale_price[numeric_cols].interpolate(method='linear')

In [6]:
# Re-count missing values per column to confirm the gaps were filled
df_sale_price.isna().sum()

Date               0
New York, NY       0
Los Angeles, CA    0
Chicago, IL        0
Houston, TX        0
Miami, FL          0
Boston, MA         0
Phoenix, AZ        0
Seattle, WA        0
dtype: int64

In [7]:
# Parse "Date" and reduce it to a monthly period (year-month, no day)
monthly_period = pd.to_datetime(df_sale_price["Date"]).dt.to_period('M')

# Swap in the monthly values and use "Date" as the index
df_sale_price = df_sale_price.assign(Date=monthly_period).set_index("Date")

# Preview the first five rows
df_sale_price.head()

Unnamed: 0_level_0,"New York, NY","Los Angeles, CA","Chicago, IL","Houston, TX","Miami, FL","Boston, MA","Phoenix, AZ","Seattle, WA"
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2008-04,401195.0,499583,239167,143000,244333,322067,226300,333833
2008-05,399667.0,495250,239500,145167,244000,333067,226000,334483
2008-06,401667.0,490833,243667,148500,245667,344167,226000,336150
2008-07,407333.0,480833,248333,152167,247333,351000,221967,334817
2008-08,414000.0,467500,251667,153500,250000,353333,216633,332483


In [8]:
# Check data types: list the dtype of every column in df_sale_price
df_sale_price.dtypes

New York, NY       float64
Los Angeles, CA      int64
Chicago, IL          int64
Houston, TX          int64
Miami, FL            int64
Boston, MA           int64
Phoenix, AZ          int64
Seattle, WA          int64
dtype: object

In [9]:
# Cast every column to float64 so that all price columns share one dtype
df_sale_price = df_sale_price.astype(np.float64)

In [10]:
# Check data types again to confirm the result of the float64 cast
df_sale_price.dtypes

New York, NY       float64
Los Angeles, CA    float64
Chicago, IL        float64
Houston, TX        float64
Miami, FL          float64
Boston, MA         float64
Phoenix, AZ        float64
Seattle, WA        float64
dtype: object

In [11]:
# Rename columns to bare city names (positional: order must match the existing columns)
city_names = [
    "New York",
    "Los Angeles",
    "Chicago",
    "Houston",
    "Miami",
    "Boston",
    "Phoenix",
    "Seattle",
]
df_sale_price.columns = city_names

In [12]:
# Pull out one column per region, by position (0 = first column ... 7 = eighth).
# Each selection is a single column, i.e. a pandas Series indexed like
# df_sale_price, despite the "df_" prefix in the variable names.
(
    df_price_nyc,
    df_price_lax,
    df_price_chi,
    df_price_hou,
    df_price_mia,
    df_price_bos,
    df_price_pho,
    df_price_sea,
) = (df_sale_price.iloc[:, position] for position in range(8))

In [14]:
# Export each region's price series to the folder with cleaned-up data

housing_prices_dir = Path("../clean_data/housing_prices")

# to_csv does not create missing folders, so make sure the target exists first
housing_prices_dir.mkdir(parents=True, exist_ok=True)

# Region code used in the file name -> price series for that region
prices_by_region = {
    "nyc": df_price_nyc,
    "lax": df_price_lax,
    "chi": df_price_chi,
    "hou": df_price_hou,
    "mia": df_price_mia,
    "bos": df_price_bos,
    "pho": df_price_pho,
    "sea": df_price_sea,
}

# Writes <code>_housing_prices.csv for every region
for region_code, region_prices in prices_by_region.items():
    region_prices.to_csv(housing_prices_dir / f"{region_code}_housing_prices.csv")