# Calculating Annualised Returns for all Zipcodes in the Train Data Set

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import functions as fn

In [None]:
#Running the data_retrieval_and_eda notebook in order to generate the train dataset.
%run ./data_retrieval_and_eda.ipynb

In [None]:
#Importing the dataset from the train.pickle file.
with open('train.pickle', 'rb') as f:
    train_df = pickle.load(f)

In [None]:
train_df.head()

In [None]:
unique_zips = len(train_df['RegionName'].unique())

print(f'There are {unique_zips} unique zipcodes')

In [None]:
#Dropping the columns that are unnecesary to calculate annualised returns.

train_df.drop(['RegionID', 'City', 'State', 'Metro', 'CountyName', 'SizeRank'], 
             axis=1, inplace=True)

In order to calculate annualised returns, we will be subseting the dataframe to only include values for the first month of the year from 2012 until 2018.

In [None]:
#returning the range for the years under analysis
years = range(2012, 2019)
#creating a list which will contain each year's first date. 
year_month_list = [datetime.strptime(f'{year}-01-01', '%Y-%m-%d').date() for year in years]
#subseting the train dataframe to only include the specified dates in year_month_list
train_foy = train_df.loc[train_df['time'].isin(year_month_list)] 

In order to calculate annualised returns we have to perform the following calculations: 

    1. Calculate YoY return: y1 = (x2-x1)/x1 or y1 = (x2/x1)-1 
        1.1 x1 will represent each beginning year value 2012-01-01, 2013-01-01, ..., 2017-01-01
        1.2 x2 will represent each ending year value 2013-01-01, 2014-01-01, ..., 2018-01-01
    
    2. Calculate the compound return: (1+y1)(1+y2)...(1+yN)^1/number of periods (years)
        2.2 (1+y1) can be represented the following ways: (1+((x2-x1)/x1)) = ((1+(x2/x1)-1) = (x2/x1)
        2.3 Therefore, the compound becomes: (x2/x1)(x3/x2)...(xN/x(N-1))^1/number of periods (years)
    
    3. Calculating the annualised return: compound return-1
        3.1 ((1+y1)(1+y2)...(1+yN)^1/number of periods)-1 

In [None]:
#Calculating (x2/x1) for each year and for all the different zipcodes.

train_foy['returns'] = train_foy['value'].div(train_foy.groupby('RegionName')['value'].shift(1))

Dropping the NaNs that have appeared in the year 2012, since that is the first year of data and we can't 
calculate the return from 2011 to 2012 (not in the dataset)

NOTE: Some of the zipcodes don't have data dating back to 2012, therefore these NaN values are also dropped during the step above.

In [None]:
#Dropping the NaNs.

train_foy = train_foy.dropna(subset=['returns'])

In [None]:
#Observing the YoY returns for the smallest zipcode in the dataset.

train_foy.loc[(train_foy['RegionName'] == 1001)]

In [None]:
zipcode_ann_returns = fn.annualised_returns(train_foy)

In [None]:
#Visualising the distribution of annualised returns through a histogram

plt.figure(figsize=(11, 7))
plt.hist(zipcode_ann_returns['Ann_returns']);

In [None]:
over15pct = zipcode_ann_returns[zipcode_ann_returns['Ann_returns'] > 0.15]
print(f'The number of zipcodes that have yielded an annualised return of over 15% is {len(over15pct)}.')

In [None]:
#Saving the sorted list of annualised returns into a pickle.

with open('annualised_returns.pickle', 'wb') as f:
    pickle.dump(zipcode_ann_returns, f)