# Calculating Annualised Returns for All Zipcodes in the Zillow Dataset

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

Retrieve the data from the Zillow research data page (https://www.zillow.com/research/data/). On that page, navigate to the "Home Values" section. From the "Data Type" dropdown select "ZHVI Single-Family Home Time Series" and from the "Geography" dropdown select "Zip Code", then download the data.

In [None]:
# Load the downloaded Zillow export; the file is read as ISO-8859-1 text.
zillow_df = pd.read_csv(
    'Zip_Zhvi_SingleFamilyResidence.csv',
    encoding='ISO-8859-1',
)
# Show the first rows as a quick sanity check of the import.
zillow_df.head()

In the dataframe above, zipcodes are represented by the column "RegionName"

In [None]:
# Number of distinct values in the zipcode column ('RegionName').
unique_zips = zillow_df['RegionName'].unique().size

print(f'There are {unique_zips} unique zipcodes')

In [None]:
def melt_data(df):
    """Reshape the wide Zillow table into a long (time, zipcode, value) table.

    Every column that is not one of the region-identifier columns is treated
    as a date column and unpivoted into a single ``time`` column. Rows without
    a value are discarded and the remaining values are averaged for each
    ``(time, RegionName)`` pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table containing the columns ``RegionID``, ``RegionName``,
        ``City``, ``State``, ``Metro``, ``CountyName`` and ``SizeRank``.
        All other column labels must be parseable as dates.

    Returns
    -------
    pandas.DataFrame
        Columns ``time`` (datetime), ``RegionName`` and ``value``, ordered by
        ``time`` and then ``RegionName``.
    """
    id_columns = ['RegionID', 'RegionName', 'City', 'State', 'Metro', 'CountyName', 'SizeRank']

    # Unpivot only the date columns; the identifier columns are repeated per row.
    melted = pd.melt(df, id_vars=id_columns, var_name='time', value_name='value')

    # `infer_datetime_format` is deprecated (pandas >= 2.0 infers the format on
    # its own), so it is no longer passed.
    melted['time'] = pd.to_datetime(melted['time'])
    melted = melted.dropna(subset=['value'])

    # Average the values of each zipcode at each date, then turn the
    # (time, RegionName) index back into ordinary columns.
    zipcode_mean_monthly_return = (
        melted.groupby(['time', 'RegionName'])
        .aggregate({'value': 'mean'})
        .reset_index()
    )

    return zipcode_mean_monthly_return

In [None]:
# Reshape the wide table into one row per (time, zipcode) with its mean value.
zillow = melt_data(zillow_df)

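As mentioned above, we will be subsetting the dataframe to only include data from January 1st 2012 until January 1st 2018 as part of our training set. Only the January 1st observation of each year is kept, since the returns are calculated year over year.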

In [None]:
# Keep only the 1 January observation of each year from 2012 to 2018.
years = range(2012, 2019)
# Build pandas Timestamps (not datetime.date objects) so the lookup values have
# the same type as the datetime 'time' column and `isin` needs no implicit cast.
year_month_list = [pd.Timestamp(year=year, month=1, day=1) for year in years]
# `.copy()` makes the subset an independent dataframe, so adding the 'returns'
# column to it later does not raise a SettingWithCopyWarning.
zillow_foy = zillow.loc[zillow['time'].isin(year_month_list)].copy()

In order to calculate annualised returns we have to perform the following calculations: 

    1. Calculate YoY return: y1 = (x2-x1)/x1 or y1 = (x2/x1)-1 
        1.1 x1 will represent each beginning year value 2012-01-01, 2013-01-01, ..., 2017-01-01
        1.2 x2 will represent each ending year value 2013-01-01, 2014-01-01, ..., 2018-01-01
    
    2. Calculate the compound return: ((1+y1)(1+y2)...(1+yN))^(1/N), where N is the number of periods (years)
        2.1 (1+y1) can be represented the following ways: (1+((x2-x1)/x1)) = (1+(x2/x1)-1) = (x2/x1)
        2.2 Therefore, the compound return becomes: ((x2/x1)(x3/x2)...(x(N+1)/xN))^(1/N)
    
    3. Calculate the annualised return: compound return - 1
        3.1 ((1+y1)(1+y2)...(1+yN))^(1/N) - 1

In [None]:
# Growth factor (x2/x1): each value divided by the previous row's value of the
# same zipcode. The first row of every zipcode has no predecessor and gets NaN.

zillow_foy['returns'] = zillow_foy['value'] / zillow_foy.groupby('RegionName')['value'].shift()

In [None]:
# The first row of each zipcode has no earlier value to compare against
# (e.g. 2012, since 2011 is not in the subset), so its return is NaN.
# Keep only the rows where a return could be calculated.

zillow_foy = zillow_foy[zillow_foy['returns'].notna()]

NOTE: Some of the zipcodes don't have data dating back to 2012. For these zipcodes the first available year has no previous value to compare against, so its return is NaN and is also dropped during the step above.

In [None]:
# Inspect the year-over-year returns of a single zipcode (1001).

zillow_foy[zillow_foy['RegionName'] == 1001]

In [None]:
def annualised_returns(df):
    """Calculate the annualised return of every zipcode in ``df``.

    For each zipcode the growth factors in the ``returns`` column are
    multiplied together, the N-th root of that product is taken (N being the
    number of growth factors of that zipcode) and 1 is subtracted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``RegionName`` (zipcode) and ``returns``
        (growth factor x2/x1 of each period). Missing ``returns`` values are
        ignored.

    Returns
    -------
    pandas.DataFrame
        One row per zipcode with the columns ``RegionName`` and
        ``Ann_returns``, sorted from the highest to the lowest annualised
        return.
    """
    # Group once per zipcode instead of re-filtering the frame for every row;
    # only the `df` argument is read, never a notebook-level global.
    growth_factors = df.groupby('RegionName')['returns']

    # min_count=1 yields NaN (rather than 1.0) for a zipcode without any
    # usable growth factor.
    compound = growth_factors.prod(min_count=1)
    periods = growth_factors.count()
    annualised = compound ** (1 / periods) - 1

    zipcode_ann_returns_df = annualised.rename('Ann_returns').reset_index()

    # Best-performing zipcodes first.
    zipcode_ann_returns_df = zipcode_ann_returns_df.sort_values('Ann_returns', ascending=False)

    return zipcode_ann_returns_df

In [None]:
# Compute the annualised return of every zipcode in the first-of-year table.
zipcode_ann_returns = annualised_returns(zillow_foy)

In [None]:
# Visualising the distribution of the annualised returns ('Ann_returns')
# through a histogram on an 11x7 inch figure.

plt.figure(figsize=(11, 7))
plt.hist(zipcode_ann_returns['Ann_returns']);

In [None]:
# Select the zipcodes whose annualised return is above 0.15 (15%) and report how many there are.
over15pct = zipcode_ann_returns.loc[zipcode_ann_returns['Ann_returns'] > 0.15]
print(f'The number of zipcodes that have yielded an annualised return of over 15% is {len(over15pct)}.')

In [None]:
# Saving the annualised returns result to 'annualised_returns.pickle' so it
# can be reloaded later without recomputing it.
# NOTE: pickle files should only ever be loaded back from a trusted source.

with open('annualised_returns.pickle', 'wb') as f:
    pickle.dump(zipcode_ann_returns, f)