## **Zillow Housing Data Preprocessing**

#### **Data and Dependencies Load**

In [1]:
import pandas as pd
import numpy as np

import warnings
# Suppress every Python warning for the rest of the session.
warnings.simplefilter("ignore")

# Display settings: print up to 100 rows, and never truncate the column list.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)

In [2]:
# Data Load
# ZHVI tables: one CSV per housing type, read from the working directory.
(
    zhvi_condo,
    zhvi_single_family,
    zhvi_one_bedroom,
    zhvi_two_bedroom,
    zhvi_three_bedroom,
) = map(
    pd.read_csv,
    (
        'zhvi_condo.csv',
        'zhvi_single_family.csv',
        'zhvi_one_bedroom.csv',
        'zhvi_two_bedroom.csv',
        'zhvi_three_bedroom.csv',
    ),
)

# ZORI tables: single-family and multi-family.
zori_single_family, zori_multi_family = map(
    pd.read_csv,
    ('zori_single_family.csv', 'zori_multi_family.csv'),
)

#### **Data Preprocessing**

In [3]:
# Function to transform the Time Series data to usable format

def time_series_formatter(df, feature:str, house_type:str):
    """Reshape a wide, one-column-per-date table into long format.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'RegionID', 'SizeRank', 'RegionName',
        'RegionType' and 'StateName'. Every other column is melted and its
        header becomes a 'Date' value, so those headers must be parseable by
        ``pd.to_datetime``. The input frame itself is not modified.
    feature : str
        Name given to the melted value column.
    house_type : str
        Constant written to the 'Type' column of every returned row.

    Returns
    -------
    DataFrame
        Columns 'Region', 'State', 'Date', ``feature`` and 'Type'. Rows whose
        ``feature`` value is missing are dropped, remaining values are rounded
        to 2 decimals, and the index produced by the melt is kept as-is (it is
        not reset, so it can contain gaps).
    """
    identifier_columns = ['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName']

    long_format = (
        df.melt(id_vars=identifier_columns, var_name='Date', value_name=feature)
        .drop(columns=['RegionID', 'SizeRank', 'RegionType'])
        .rename(columns={'RegionName': 'Region', 'StateName': 'State'})
        .dropna(subset=[feature])
    )

    # Derived columns are computed on the already-filtered rows; `assign`
    # overwrites the existing columns in place and appends 'Type' last.
    derived_columns = {
        feature: long_format[feature].round(2),
        'Date': pd.to_datetime(long_format['Date']),
        'Type': house_type,
    }
    return long_format.assign(**derived_columns)

In [4]:
# Format Time Series Data
# Each wide table is replaced by its long-format version under the same name.
# NOTE: this overwrites the raw frames, so re-run the data-load cell before
# re-running this one (the formatter needs the original identifier columns).

# ZHVI tables
zhvi_condo = time_series_formatter(zhvi_condo, feature='ZHVI', house_type='Condo')
zhvi_single_family = time_series_formatter(zhvi_single_family, feature='ZHVI', house_type='Single Family')
zhvi_one_bedroom = time_series_formatter(zhvi_one_bedroom, feature='ZHVI', house_type='One Bedroom')
zhvi_two_bedroom = time_series_formatter(zhvi_two_bedroom, feature='ZHVI', house_type='Two Bedroom')
zhvi_three_bedroom = time_series_formatter(zhvi_three_bedroom, feature='ZHVI', house_type='Three Bedroom')

# ZORI tables
zori_single_family = time_series_formatter(zori_single_family, feature='ZORI', house_type='Single Family')
zori_multi_family = time_series_formatter(zori_multi_family, feature='ZORI', house_type='Multi Family')

In [15]:
# Inspect the long-format multi-family ZORI table produced by the formatting step.
zori_multi_family

Unnamed: 0,Region,State,Date,ZORI,Type
0,United States,,2015-01-31,1227.73,Multi Family
1,"New York, NY",NY,2015-01-31,2271.09,Multi Family
2,"Los Angeles, CA",CA,2015-01-31,1716.81,Multi Family
3,"Chicago, IL",IL,2015-01-31,1429.76,Multi Family
4,"Dallas, TX",TX,2015-01-31,1027.88,Multi Family
...,...,...,...,...,...
55384,"Macomb, IL",IL,2024-03-31,646.67,Multi Family
55385,"Rockport, TX",TX,2024-03-31,1098.00,Multi Family
55386,"Wahpeton, ND",ND,2024-03-31,697.26,Multi Family
55387,"Jamestown, ND",ND,2024-03-31,885.83,Multi Family


In [23]:
# Same inspection as the earlier `zori_multi_family` cell; the recorded output is identical.
# NOTE(review): duplicate cell, and the execution counts (15, 23, 14) are out of order —
# consider removing it and confirming the notebook survives Restart & Run All.
zori_multi_family

Unnamed: 0,Region,State,Date,ZORI,Type
0,United States,,2015-01-31,1227.73,Multi Family
1,"New York, NY",NY,2015-01-31,2271.09,Multi Family
2,"Los Angeles, CA",CA,2015-01-31,1716.81,Multi Family
3,"Chicago, IL",IL,2015-01-31,1429.76,Multi Family
4,"Dallas, TX",TX,2015-01-31,1027.88,Multi Family
...,...,...,...,...,...
55384,"Macomb, IL",IL,2024-03-31,646.67,Multi Family
55385,"Rockport, TX",TX,2024-03-31,1098.00,Multi Family
55386,"Wahpeton, ND",ND,2024-03-31,697.26,Multi Family
55387,"Jamestown, ND",ND,2024-03-31,885.83,Multi Family


In [14]:
# Count missing values per column of the formatted two-bedroom ZHVI table.
# In the recorded output only State has nulls — presumably rows like the
# national "United States" row, which has no state; verify against the data.
zhvi_two_bedroom.isnull().sum()

Region      0
State     291
Date        0
ZHVI        0
Type        0
dtype: int64