In [1]:
import pandas as pd
from datetime import datetime
# disable scientific notation
pd.set_option('display.float_format', lambda x: '%.8f' % x)

In [2]:
# load the data, convert the date column to a datetime, then use that to generate a previous year's value
df = pd.read_csv("../data/results-suburb-aggregated-data.csv")[["SUBURB_NAME","DATE_MONTH","avg_land_value","property_count"]]
df["DATE_MONTH"] = df["DATE_MONTH"].apply(lambda s: datetime.strptime(s, "%Y-%m-%d"))
df["prev_date_month"] = df["DATE_MONTH"].apply(lambda dt: datetime(year=dt.year - 1, month = dt.month, day = dt.day))
df

Unnamed: 0,SUBURB_NAME,DATE_MONTH,avg_land_value,property_count,prev_date_month
0,BARANGAROO,2019-07-01,61507905.44444443,27,2018-07-01
1,BARANGAROO,2020-07-01,57216515.07407408,27,2019-07-01
2,BARANGAROO,2018-07-01,52233290.80000000,15,2017-07-01
3,WILPINJONG,2019-07-01,43932000.00000000,3,2018-07-01
4,WILPINJONG,2018-07-01,43386333.33333333,3,2017-07-01
...,...,...,...,...,...
22190,WOLLONDILLY,2016-07-01,160.00000000,1,2015-07-01
22191,WOLLONDILLY,2017-07-01,160.00000000,1,2016-07-01
22192,WOLLONDILLY,2018-07-01,160.00000000,1,2017-07-01
22193,WOLLONDILLY,2019-07-01,160.00000000,1,2018-07-01


In [3]:
# effectively a lookup table to later find the land value for any given SUBURB_NAME, date pair
value_dict = {(row.SUBURB_NAME, row.DATE_MONTH.strftime("%Y-%m-%d")):row.avg_land_value for _, row in df.iterrows()}

In [4]:
def value_from_row(row):
    try:
        return value_dict[(row.SUBURB_NAME, row.prev_date_month.strftime("%Y-%m-%d"))]
    except KeyError:
        return 0

df["prev_avg_land_value"] = df[["SUBURB_NAME","prev_date_month"]].apply(lambda row:value_from_row(row),axis=1)
# drop any suburbs,date_month pairs with no YoY change due to no prior date
df = df[df.prev_avg_land_value != 0]

In [5]:
df.head()

Unnamed: 0,SUBURB_NAME,DATE_MONTH,avg_land_value,property_count,prev_date_month,prev_avg_land_value
0,BARANGAROO,2019-07-01,61507905.44444443,27,2018-07-01,52233290.8
1,BARANGAROO,2020-07-01,57216515.07407408,27,2019-07-01,61507905.44444443
2,BARANGAROO,2018-07-01,52233290.8,15,2017-07-01,41589301.33333333
3,WILPINJONG,2019-07-01,43932000.0,3,2018-07-01,43386333.33333333
4,WILPINJONG,2018-07-01,43386333.33333333,3,2017-07-01,101000.0


In [6]:
# basic calculations to determine the YoY change, and the YoY change as a percentage
df["year_on_year_avg_land_value_change"] = df.apply(lambda row: row.avg_land_value - row.prev_avg_land_value, axis=1)
df["year_on_year_avg_land_value_change_perc"] = df.apply(
    lambda row: 
        (row.avg_land_value - row.prev_avg_land_value) / (row.prev_avg_land_value) * 100, 
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
# quick check to see that the data is being populated as expected
df.groupby("SUBURB_NAME").head()

Unnamed: 0,SUBURB_NAME,DATE_MONTH,avg_land_value,property_count,prev_date_month,prev_avg_land_value,year_on_year_avg_land_value_change,year_on_year_avg_land_value_change_perc
0,BARANGAROO,2019-07-01,61507905.44444443,27,2018-07-01,52233290.80000000,9274614.64444444,17.75613694
1,BARANGAROO,2020-07-01,57216515.07407408,27,2019-07-01,61507905.44444443,-4291390.37037035,-6.97697367
2,BARANGAROO,2018-07-01,52233290.80000000,15,2017-07-01,41589301.33333333,10643989.46666667,25.59309516
3,WILPINJONG,2019-07-01,43932000.00000000,3,2018-07-01,43386333.33333333,545666.66666667,1.25769251
4,WILPINJONG,2018-07-01,43386333.33333333,3,2017-07-01,101000.00000000,43285333.33333333,42856.76567657
...,...,...,...,...,...,...,...,...
22188,JINDABYNE EAST,2017-07-01,330.00000000,1,2016-07-01,310.00000000,20.00000000,6.45161290
22191,WOLLONDILLY,2017-07-01,160.00000000,1,2016-07-01,160.00000000,0.00000000,0.00000000
22192,WOLLONDILLY,2018-07-01,160.00000000,1,2017-07-01,160.00000000,0.00000000,0.00000000
22193,WOLLONDILLY,2019-07-01,160.00000000,1,2018-07-01,160.00000000,0.00000000,0.00000000


In [8]:
# basic formatting to clean the data for a csv file
df_final = df[["SUBURB_NAME","DATE_MONTH","avg_land_value","property_count", "year_on_year_avg_land_value_change", "year_on_year_avg_land_value_change_perc"]]
df_final.rename(columns={
    "year_on_year_avg_land_value_change":"yoy_avg_land_value_change",
    "year_on_year_avg_land_value_change_perc":"yoy_avg_land_value_change_perc",
},inplace=True)
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17745 entries, 0 to 22194
Data columns (total 6 columns):
SUBURB_NAME                       17745 non-null object
DATE_MONTH                        17745 non-null datetime64[ns]
avg_land_value                    17745 non-null float64
property_count                    17745 non-null int64
yoy_avg_land_value_change         17745 non-null float64
yoy_avg_land_value_change_perc    17745 non-null float64
dtypes: datetime64[ns](1), float64(3), int64(1), object(1)
memory usage: 970.4+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [9]:
# quick final check to see if any null values exist. 
# It simplifies any next stage in the data pipeline that can't do data cleaning
df_final.isnull().sum()

SUBURB_NAME                       0
DATE_MONTH                        0
avg_land_value                    0
property_count                    0
yoy_avg_land_value_change         0
yoy_avg_land_value_change_perc    0
dtype: int64

In [10]:
# make the csv. We don't need the index since it conveys not pipeline-relevant information
df_final.to_csv("../data/yoy-value-change-gb-suburb-and-month.csv", index=False)