# Zillow EDA
---

## Load Preprocessed Data
---

In [None]:
import sqlite3 as sql
from pathlib import Path

# Location of the preprocessed Zillow SQLite database, relative to the notebook's
# working directory.
zillow_preprocessed_data_db = '../data/Zillow/zillow.db'

# sqlite3.connect() creates a brand-new, empty database when the file is missing
# (as long as the directory exists). A wrong path or working directory would then
# only surface later as a confusing "no such table" error, so fail fast here.
if not Path(zillow_preprocessed_data_db).is_file():
    raise FileNotFoundError(
        f"Preprocessed Zillow database not found: {Path(zillow_preprocessed_data_db).resolve()}"
    )

# Shared connection used by the queries in the cells below.
zillow_conn = sql.connect(zillow_preprocessed_data_db)

In [None]:
import logging

# Create a logger
logger = logging.getLogger('zillow_eda')
logger.setLevel(logging.DEBUG)

# logging.getLogger() returns the same logger object on every call, so re-running
# this cell would otherwise stack another file/console handler pair: every record
# would then be emitted once per run and the old file handle would stay open.
# Detach and close whatever a previous run attached before adding fresh handlers.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Create file handler which logs even debug messages
fh = logging.FileHandler('eda.log')
fh.setLevel(logging.DEBUG)

# Create console handler with a higher log level (only ERROR and above reach the notebook output)
ch = logging.StreamHandler()
ch.setLevel(logging.ERROR)

# Create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)

# Add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

logger.info('Logger for Zillow EDA initialized')


# California Data
---

In [None]:
import pandas as pd

# Read every row of the `rent` table whose StateName is 'CA' into a DataFrame,
# using the SQLite connection opened earlier in the notebook.
zillow_ca_rental_df = pd.read_sql_query(
    sql="SELECT * FROM rent WHERE StateName = 'CA'",
    con=zillow_conn,
    index_col=None,
)
logger.info('California Rental Data loaded')

In [None]:
import pygwalker as pyg

## Rental Data for SD, LA, and SF
---

In [None]:
# Preview the first rows of the California rental frame as a sanity check.
zillow_ca_rental_df.head()

In [None]:
import json

# The saved pygwalker spec used to be pasted here as one huge JSON literal. That
# literal had raw line breaks inside several JSON string values (e.g. inside
# "rent (USD)"), which strict JSON parsers reject, and it repeated the same chart
# definition three times with only the region, id and title changed. The spec is
# now generated from one helper and serialized with json.dumps, so it is always
# valid JSON and the three charts cannot drift apart.

# Columns exposed to every chart as nominal dimensions / quantitative measures.
_DIMENSION_COLUMNS = ["RegionName", "RegionType", "StateName", "date", "Season", "Political Leaning"]
_MEASURE_COLUMNS = ["index", "RegionID", "SizeRank"]

# (RegionName filter value, pygwalker visId, chart title) for each chart tab.
_CITY_CHARTS = [
    ("Los Angeles, CA", "gw_dYXm", "Los Angeles Rental Fluctuations"),
    # Trailing space preserved from the saved spec's chart name.
    ("San Diego, CA", "gw_qeTO", "San Diego Rental Fluctuations "),
    ("San Francisco, CA", "gw_TCPL", "San Francisco Rental Fluctuations"),
]


def _dimension_field(fid):
    """Return the spec entry describing column ``fid`` as a nominal dimension."""
    return {
        "fid": fid,
        "name": fid,
        "basename": fid,
        "semanticType": "nominal",
        "analyticType": "dimension",
        "offset": 0,
    }


def _measure_field(fid, name=None, agg="sum"):
    """Return the spec entry describing column ``fid`` as a quantitative measure.

    ``name`` is the display name (defaults to ``fid``); ``agg`` is the aggregation name.
    """
    return {
        "fid": fid,
        "name": fid if name is None else name,
        "basename": fid,
        "analyticType": "measure",
        "semanticType": "quantitative",
        "aggName": agg,
        "offset": 0,
    }


def _region_rule(region):
    """Return the filter rule that keeps only rows whose RegionName is ``region``."""
    return {"type": "one of", "value": [region]}


def _city_chart(region, vis_id, title, agg="median"):
    """Return one chart entry: ``agg`` of rent per date, coloured by Season, for ``region``."""
    dimensions = [_dimension_field(col) for col in _DIMENSION_COLUMNS]
    dimensions.append({
        "fid": "gw_mea_key_fid",
        "name": "Measure names",
        "analyticType": "dimension",
        "semanticType": "nominal",
    })

    measures = [_measure_field(col) for col in _MEASURE_COLUMNS]
    measures += [
        _measure_field("rent", name="rent (USD)"),
        {
            "fid": "gw_count_fid",
            "name": "Row count",
            "analyticType": "measure",
            "semanticType": "quantitative",
            "aggName": "sum",
            "computed": True,
            "expression": {"op": "one", "params": [], "as": "gw_count_fid"},
        },
        {
            "fid": "gw_mea_val_fid",
            "name": "Measure values",
            "analyticType": "measure",
            "semanticType": "quantitative",
            "aggName": "sum",
        },
    ]

    encodings = {
        "dimensions": dimensions,
        "measures": measures,
        "rows": [_measure_field("rent", name="rent (USD)", agg=agg)],
        "columns": [_dimension_field("date")],
        "color": [_dimension_field("Season")],
    }
    # Channels the saved spec leaves empty.
    for channel in ("opacity", "size", "shape", "radius", "theta",
                    "longitude", "latitude", "geoId", "details"):
        encodings[channel] = []
    encodings["filters"] = [dict(_dimension_field("RegionName"), rule=_region_rule(region))]
    encodings["text"] = []

    return {
        "config": {
            "defaultAggregated": True,
            "geoms": ["auto"],
            "coordSystem": "generic",
            "limit": -1,
            "timezoneDisplayOffset": 0,
        },
        "encodings": encodings,
        "layout": {
            "showActions": False,
            "showTableSummary": False,
            "stack": "stack",
            "interactiveScale": False,
            "zeroScale": True,
            "size": {"mode": "full", "width": 320, "height": 200},
            "format": {},
            "geoKey": "name",
            "resolve": {
                "x": False,
                "y": False,
                "color": False,
                "opacity": False,
                "shape": False,
                "size": False,
            },
        },
        "visId": vis_id,
        "name": title,
    }


def _city_workflow(region, agg="median"):
    """Return the workflow entry (region filter + per-date/Season aggregate) matching ``_city_chart``."""
    return {
        "workflow": [
            {"type": "filter", "filters": [{"fid": "RegionName", "rule": _region_rule(region)}]},
            {
                "type": "view",
                "query": [{
                    "op": "aggregate",
                    "groupBy": ["date", "Season"],
                    "measures": [{"field": "rent", "agg": agg, "asFieldKey": f"rent_{agg}"}],
                }],
            },
        ]
    }


# NOTE(review): the saved spec aggregated the San Francisco chart with "sum" while
# Los Angeles and San Diego used "median"; all three now use "median" so the tabs
# are comparable. Pass agg="sum" for that city if the difference was intentional.
vis_spec = json.dumps({
    "config": [_city_chart(region, vis_id, title) for region, vis_id, title in _CITY_CHARTS],
    "chart_map": {},
    "workflow_list": [_city_workflow(region) for region, _, _ in _CITY_CHARTS],
    "version": "0.4.8.5",
})
pyg.walk(zillow_ca_rental_df, spec=vis_spec)
logger.info('California Rental Data visualized')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_correlation_heatmap(df, figsize=(10, 10), cmap='coolwarm', title='Correlation Heatmap'):
    """Show a heatmap of the pairwise correlations of ``df`` after ``pd.get_dummies``.

    NOTE(review): a later cell defines a function with this same name; once that
    cell has run, this definition is shadowed.

    Args:
        df: DataFrame to encode and correlate.
        figsize: Size passed to ``plt.figure``.
        cmap: Colormap name passed to ``sns.heatmap``.
        title: Title drawn above the heatmap.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Encode with get_dummies, then correlate every pair of resulting columns.
    corr = pd.get_dummies(df).corr()

    plt.figure(figsize=figsize)
    sns.heatmap(corr, center=0, cmap=cmap)
    plt.title(title)
    plt.show()


In [None]:
import pandas as pd
import seaborn as sns  # This is used in the function
import matplotlib.pyplot as plt  # This is used in the function

def plot_correlation_heatmap(df, figsize=(10, 10), cmap='coolwarm', title='Correlation Heatmap',
                             annot=False, fmt=".2f"):
    """Plot a correlation heatmap for ``df`` after encoding it with ``pd.get_dummies``.

    Args:
        df: DataFrame to encode and correlate.
        figsize: Size passed to ``plt.figure``.
        cmap: Colormap name passed to ``sns.heatmap``.
        title: Title drawn above the heatmap.
        annot: Forwarded to ``sns.heatmap``; when true, each cell is labelled with
            its value. Defaults to ``False``, which matches the previous behaviour
            (the old call passed ``fmt`` but never ``annot``, so nothing was
            annotated).
        fmt: Format string for the cell labels; only used when ``annot`` is set.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Encode categorical data
    df_encoded = pd.get_dummies(df)

    correlation_matrix = df_encoded.corr()

    plt.figure(figsize=figsize)

    # fmt only takes effect together with annot, so both are forwarded explicitly.
    sns.heatmap(correlation_matrix, cmap=cmap, center=0, annot=annot, fmt=fmt)

    plt.title(title)
    plt.show()

In [None]:
# Training set 1 is the full California rental frame; this is only an alias,
# no columns are selected or dropped here.
training_set1 = zillow_ca_rental_df

# Draw the correlation heatmap for training set 1 on a (20, 20) figure.
plot_correlation_heatmap(training_set1, figsize=(20, 20), title="Training Set 1")
logger.info('training set 1 correlation heatmap plotted')

We can see that no individual date shows any correlation with rent, so we can drop the dates from the training set.
Season also shows no correlation with rent, so we can drop it from the training set.
SizeRank likewise shows no correlation with rent, so we can drop it from the training set.

My conclusion is that I either need a better dataset or need to add new features to the training data.

* I will create the first training set from this dataset, adding features such as unemployment rate, median income, taxes, jobs, and population.


In [None]:
#TODO(1) write a function to fetch the unemployment rate for each region from the census data.
#TODO(2) write a function to fetch the median income for each region from the census data.
#TODO(3) write a function to fetch tax data for each region (data source to be decided).
#TODO(4) write a function to fetch jobs data for each region (data source to be decided).
#TODO(5) write a function to fetch the population for each region from the census data.