# bootstrapping
# naive forecast
# moving average for week vs year

# <center> Land Reg </center>

# <center> What areas that I can afford now, have posted the best returns for the past 10 years? </center>

## <center> Variables </center> 

<i>area</i> refers to the options for area selection present in the postcode data. <p>The options are listed in order of descending size:
        <ul>
            <li><i>county_name</i></li>
            <li><i>town/city</i></li>
            <li><i>ward_name</i></li>
            <li><i>postcode</i></li>
            <li><i>MSOA</i></li>
            <li><i>LSOA</i></li>
        </ul>

The further down the list you go, the more targeted the area. 

<i>target</i> is the metric to be compared in all future functions. <p>The options are:
        <ul>
            <li><i>pct_change</i> - Percentage Change</li>
            <li><i>price_diff</i> - Difference in Price year on year</li>
            <li><i>price</i> - House Price</li>
        </ul>
        
<i>year_min</i> & <i>year_max</i> are variables that set the range future functions will search and compare.<p>
<i>budget_min</i> & <i>budget_max</i> sets your budget for future functions

In [None]:
import os

# Metric compared by the analysis functions below: 'pct_change', 'price_diff' or 'price'.
target = 'pct_change'
# Geographic column used to group properties (e.g. 'county_name', 'ward_name', 'MSOA', 'LSOA').
area = 'MSOA'

# Range of sale years to search and compare.
year_min = 2000
year_max = 2020
# Purchase budget bounds used to decide which properties count as affordable.
budget_min = 200000
budget_max = 350000
# Maps the single-letter property-type codes to readable labels.
property_type_dict = {"D": 'Detached', "F": "Flat", 'S':'Semi', 'T': "Terrace"}
# Sibling folders are derived by swapping "Notebooks" in the current working directory path,
# so this cell assumes it is run from a directory whose path contains "Notebooks".
regen_folder = os.getcwd().replace("Notebooks", "Text Files") 
csv_folder = os.getcwd().replace("Notebooks", "CSVs")
notebook_folder = os.getcwd()
# Persist the settings with IPython's %store magic so other notebooks can restore them.
%store target
%store area
%store year_min
%store year_max
%store budget_min
%store budget_max
%store csv_folder

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import math

import os.path

from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

import pandas as pd
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 400000)
pd.set_option("display.width", 1000)

from matplotlib import pyplot as plt
import matplotlib.ticker as mtick
%matplotlib inline
plt.rcParams['figure.figsize'] = [20, 10]

import numpy as np

from sklearn.linear_model import LinearRegression, TheilSenRegressor
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.metrics import r2_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split

import seaborn as sns

import plotly as py
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.express as px

font = {'family' : 'arial',
        'weight' : 'normal',
        'size'   : 18}

plt.rc('font', **font)



In [None]:
# Work from the CSV folder so the bare file names below resolve.
os.chdir(csv_folder)

file_list = [
    'all_properties_and_postcodes.csv',
    'entire_period_filtered.csv',
    'filtered_df.csv',
]

# Assume every cached CSV is present until one turns out to be missing.
file_check = True

for file in file_list:
    if not os.path.isfile(file):
        # One missing file is enough to decide; stop checking the rest.
        print (f"Did not find {file}")
        file_check = False
        break
    print (f'Found {file}')

In [None]:
# Load the three datasets used throughout the notebook. The working directory is the
# CSV folder at this point (set by the file check above), so bare file names are used.
if file_check:
    # Every cached CSV was found, so read them directly.
    print('Reading all_properties_and_postcodes.csv')
    all_properties_and_postcodes = pd.read_csv('all_properties_and_postcodes.csv', low_memory=False)
    
    print('Reading filtered_df.csv')
    filtered_df = pd.read_csv('filtered_df.csv', low_memory=False)
    
    print('Reading entire_period_filtered.csv')
    entire_period_filtered = pd.read_csv('entire_period_filtered.csv', low_memory=False)
    
else:
    # At least one CSV is missing: run the "Read & Filter" notebook first
    # (presumably it writes the CSVs - confirm), then read the same three files.
    print('CSV not found. Running notebook.')
    
    # NOTE(review): hard-coded absolute path; it only resolves on the original author's machine.
    %run '/Users/nathanprice/Dropbox/Python/Land-Reg-Data-Analysis/Read & Filter.ipynb'
    
    all_properties_and_postcodes = pd.read_csv('all_properties_and_postcodes.csv',
                                               low_memory=False)
    print ('properties_and_postcodes read')
    
    filtered_df = pd.read_csv('filtered_df.csv',
                              low_memory=False)
    print ('filtered_df read')
    
    entire_period_filtered = pd.read_csv('entire_period_filtered.csv',
                                         low_memory=False)
    print ('entire_period_filtered read \n')
    
# parse_dates within read_csv did not work, so the date columns are converted
# afterwards with an explicit format string.
print ('Recasting date strings to datetime')
all_properties_and_postcodes['date_sold'] = pd.to_datetime(all_properties_and_postcodes['date_sold'], format='%Y-%m-%d %H:%M')
filtered_df['date_sold'] = pd.to_datetime(filtered_df['date_sold'], format='%Y-%m-%d %H:%M')
entire_period_filtered['year_sold_min'] = pd.to_datetime(entire_period_filtered['year_sold_min'], format='%Y-%m-%d %H:%M')
entire_period_filtered['year_sold_max'] = pd.to_datetime(entire_period_filtered['year_sold_max'], format='%Y-%m-%d %H:%M')
print ('All dates are now datetime objects \n')

# Switch the working directory back to the notebook folder recorded in the settings cell.
os.chdir(notebook_folder)

### within_budget_function

The data needs to be sliced to the budget repeatedly, so this helper does it in one place instead of rewriting the filter every time a new slice is needed.<br>Takes one argument, a dataframe to be filtered. <br> Returns the filtered dataframe. 

In [None]:
def within_budget_function(df):
    """Return the rows of ``df`` whose ``price`` lies inside the budget.

    Both bounds are inclusive and are read from the notebook-level
    ``budget_min`` and ``budget_max`` variables when the function is called.
    """
    price = df['price']
    at_or_above_minimum = price >= budget_min
    at_or_below_maximum = price <= budget_max
    return df.loc[at_or_above_minimum & at_or_below_maximum]

### create_grid_cords

My solution to being able to automatically create the correct number of subplots for the amount of plots being created. <br>
Takes a single argument, top_n. top_n is the amount of total plots needed to be created. <br>
Returns a list of coordinates for rows = top_n / 2 and cols = 2

In [None]:
def create_grid_cords(top_n):
    """Return ``[row, col]`` subplot coordinates for a two-column grid.

    Parameters
    ----------
    top_n : int
        Number of plots that need a slot. An odd value is rounded up so the
        final row is always complete.

    Returns
    -------
    list of [int, int]
        1-based ``[row, col]`` pairs in row-major order, e.g.
        ``[[1, 1], [1, 2], [2, 1], [2, 2]]`` for ``top_n`` of 3 or 4.
        Empty when ``top_n`` is 0 or negative.
    """
    # Round up so an odd number of plots still gets a full last row.
    n_rows = (top_n + 1) // 2

    return [[row, col] for row in range(1, n_rows + 1) for col in (1, 2)]

In [None]:
create_grid_cords(6)

### all_properties_and_postcodes

<i>all_properties_and_postcodes</i> is a dataframe containing all the land registry data merged with the postcode data. 

In [None]:
all_properties_and_postcodes.head(5)

In [None]:
figure = sns.displot(data=all_properties_and_postcodes['price'], kde=True, bins=50, height=8, aspect=1.5)
plt.xlim(all_properties_and_postcodes['price'].min(), all_properties_and_postcodes['price'].max())
plt.title('Distribution of house prices within all_properties_and_postcodes'.title())
plt.show();

In [None]:
# Number of sales at each distinct price, for sales made after year_min.
cdf = (
    all_properties_and_postcodes
    .loc[lambda sales: sales['year_sold'] > year_min]
    .groupby('price')['address']
    .count()
    .reset_index()
    .rename(columns={'address': 'count'})
)

# Running share of all counted sales at or below each price (rows are in ascending price order).
cdf_total = cdf['count'].sum()
cdf['running_total'] = cdf['count'].cumsum()
cdf['percentage'] = cdf['running_total'].div(cdf_total).mul(100)

In [None]:
# Cumulative share of sales at or below each budget bound. Summing the counts up to the
# bound (rather than looking up a row priced exactly at the bound) gives the same value
# when such a row exists and still works when no property sold for exactly that amount.
pct_min = float(cdf.loc[cdf['price'] <= budget_min, 'count'].sum() / cdf_total * 100)
pct_max = float(cdf.loc[cdf['price'] <= budget_max, 'count'].sum() / cdf_total * 100)

# The ',' format spec inserts thousands separators for a budget of any size.
print (f"{int(pct_max - pct_min)}% of houses from {year_min} to {year_max} are in your budget of £{budget_min:,} to £{budget_max:,}")

cdf.plot(x='price', y='percentage', kind='line', figsize=(10, 8), legend=False, style='r')
plt.title('Cumulative Percentage of properties by value'.title(), y = 1.01, fontsize=20)
plt.ylabel('percentage of houses'.title(), labelpad=15)
plt.xlabel('House price'.title(), labelpad=15)
plt.xticks(rotation=25)
# Dashed vertical markers at the two budget bounds.
plt.axvline(x=budget_min, color='blue', linestyle='--')
plt.axvline(x=budget_max, color='darkblue', linestyle='--')
plt.legend(['Cumulative Percentage', 'Minimum Budget', 'Maximum Budget'])
plt.show();




### Percentage of Affordable Properties per County


In [None]:
# Total number of sales recorded in each county.
area_total = (
    all_properties_and_postcodes
    .groupby(['county_name'])
    .agg(total=('price', 'count'))
    .reset_index()
)

# Number of sales in each county that fall inside the budget.
df_within_budget = (
    within_budget_function(all_properties_and_postcodes)
    .groupby(['county_name'])
    .agg(count=('price', 'count'))
    .reset_index()
)

# Put both counts side by side and express the affordable ones as a percentage.
compare_counts = df_within_budget.merge(area_total, on='county_name')
compare_counts['percentage'] = compare_counts['count'].div(compare_counts['total']).mul(100)

# Keep only the first 15 rows for the charts.
compare_counts = compare_counts.head(15)




In [None]:
# Filter the frame once per chart so labels and values always come from the same rows.
# Taking labels from the full frame and values from a filtered series pairs them by
# position, which attaches county names to the wrong numbers.
pie_data = compare_counts.loc[compare_counts['percentage'] >= 20]

percentage_figs = go.Figure()
percentage_figs = percentage_figs.add_trace(go.Pie(labels=pie_data['county_name'], values=pie_data['count'], name='Affordable Houses'))
percentage_figs = percentage_figs.update_layout(title_text='Affordable Houses as Percentage of total available'.title())
percentage_figs.show()

# Sort the whole filtered frame so each county name moves together with its percentage.
bar_data = compare_counts.loc[compare_counts['percentage'] >= 25].sort_values('percentage', ascending=False)

bar_figs = go.Figure()
bar_figs = bar_figs.add_trace(go.Bar(y=bar_data['county_name'], x=bar_data['percentage'], showlegend=False, orientation='h'))
# The data is grouped by county_name, so the title names counties rather than wards.
bar_figs = bar_figs.update_layout(title_text='Affordable Houses as Percentage of total houses within each County'.title())

bar_figs.show()

### filtered_df

In [144]:
filtered_df.head(5)

Unnamed: 0.1,Unnamed: 0,index,price,postcode,property_type,town/city,date_sold,year_sold,address,new,freehold,county_name,ward_name,LSOA,MSOA,Longitude,Latitude
3,3,13728238,250000,AL10 0AB,T,HATFIELD,2014-05-02,2014,al100ablongmead039,0,1,Hertfordshire,Hatfield Central,WelwynHatfield010F,WelwynHatfield010,-0.223341,51.773122
0,0,14003432,245000,AL10 0AB,T,HATFIELD,2014-12-12,2014,al100ablongmead025,0,1,Hertfordshire,Hatfield Central,WelwynHatfield010F,WelwynHatfield010,-0.223341,51.773122
2,2,15910737,348000,AL10 0AB,T,HATFIELD,2016-03-17,2016,al100ablongmead039,0,1,Hertfordshire,Hatfield Central,WelwynHatfield010F,WelwynHatfield010,-0.223341,51.773122
1,1,15560062,335000,AL10 0AB,T,HATFIELD,2016-08-26,2016,al100ablongmead025,0,1,Hertfordshire,Hatfield Central,WelwynHatfield010F,WelwynHatfield010,-0.223341,51.773122
4,4,17966021,285000,AL10 0AD,T,HATFIELD,2019-08-15,2019,al100adlongmead063,0,1,Hertfordshire,Hatfield Central,WelwynHatfield010B,WelwynHatfield010,-0.218732,51.773475


<b>filtered_df</b> is a dataframe containing a subsection of properties deemed affordable. 

<ul>
    <li>Only contains properties that were within budget for the past 5 years </li> 
    <li>Only contains properties that were sold more than once</li>
    <li>Only contains properties that have a single property type</li>
    <li>Does a final sweep to ensure all properties are within a range of the budget </li>
</ul>

In [None]:
figure = sns.displot(data=filtered_df['price'], kde=True, bins=50, height=8, aspect=1.5)
plt.xlim(filtered_df['price'].min(), filtered_df['price'].max())
plt.title('Distribution of house prices within filtered_df'.title())
plt.show();

### entire_period_filtered

In [None]:
entire_period_filtered.head(5)

<b>entire_period_filtered</b> is a dataframe that has a single entry for each address. <br> It takes the first sale and the last sale and makes calculations between the two. But it does not have access to year-on-year data. 

<ul>
    <li><b>year_diff</b> is the difference between the year of the first sale and the year of the last sale.</li>
    <li><b>price_diff</b> is the difference in price from the first sale to the last sale.</li>
    <li><b>price_change_per_year</b> is the average price change for each year.</li>
    <li><b>total_pct_change</b> is the absolute percentage change from the first sale to the last.</li>
    <li><b>pct_change_per_year</b> is the average percentage change for each year.</li>
</ul>

In [None]:
figure = sns.displot(data=entire_period_filtered['max_price'], kde=True, bins=50, height=8, aspect=1.5)
plt.xlim(entire_period_filtered['max_price'].min(), entire_period_filtered['max_price'].max())
plt.title('Distribution of house prices within entire_period_filtered'.title())
plt.show();

### find_best_area

In [None]:
def find_best_area(dataset, target=target, area=area, n=15, min_n = 100, year_min=None, year_max=None):
    """Rank areas by the median of ``target`` and return the best ``n``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the ``area`` column, the ``target`` column and ``county_name``.
    target : str
        Column whose per-area median is used for the ranking.
    area : str
        Column that identifies the area each row belongs to.
    n : int
        Number of top-ranked (area, county) groups to keep.
    min_n : int
        Areas with ``min_n`` rows or fewer are discarded before ranking.
    year_min, year_max
        Currently unused; kept so existing calls that pass them keep working.

    Returns
    -------
    tuple
        ``(group, area_names, entire_period_best_area_dataframe)``: the ranked
        medians as a dataframe, the list of best area names, and the rows of
        the filtered ``dataset`` that belong to those areas.
    """
    # Count rows per area once and map the count back onto every row, instead of
    # rescanning the whole frame for each distinct area. Rows whose area is missing
    # map to NaN, fail the comparison and are dropped.
    rows_per_area = dataset[area].map(dataset[area].value_counts())
    dataset = dataset.loc[rows_per_area > min_n]

    group = (
        dataset.groupby([area, 'county_name'])[target]
        .median()
        .sort_values(ascending=False)
        .reset_index()
        .head(n)
    )

    area_names = list(group[area].unique())

    entire_period_best_area_dataframe = dataset.loc[dataset[area].isin(area_names)]

    return group, area_names, entire_period_best_area_dataframe

<b>find_best_area</b> is a function to find the best n areas. 
<p>The areas are ordered by the median increase of the <b>target</b> variable.

It will only use areas that have more than <b>min_n</b> entries. 

It returns three objects:<br>
<ul>    
    <li>A dataframe that contains the median of the target dataset grouped by the specified area and county name.</li>
    <li>A list of the best area names</li>
    <li>A dataframe similar to <b>entire_period_filtered</b>, but only containing entries for the top n areas as found by the function.</li>
    </ul>

### Top N Areas

Listed below are the best <b>area</b> names as created by the <b>find_best_area</b> function. 

In [None]:
top_n = 8

In [None]:
best_area, best_area_names, entire_period_best_area_dataframe = find_best_area(entire_period_filtered,
                                                                               area=area,
                                                                               target=target)

best_area_names[:top_n]

### Median Percentage Change by County / MSOA

On the left is median percentage change by county, and on the right is MSOA. 

In [None]:
# Median of the target metric per county, restricted to rows from the best areas.
best_area_counties_median = entire_period_best_area_dataframe.groupby('county_name')[target].median().reset_index()

fig, axes = plt.subplots(nrows=1, ncols=2)

# Left: medians per county. Right: medians per selected area; the title follows the
# configured ``area`` column instead of being fixed to 'MSOA'.
fig_1 = best_area_counties_median.plot(kind='bar', x='county_name', y=target, ax=axes[0], rot=75, colormap='coolwarm', legend=False, title='County Name')
fig_2 = best_area.plot(kind='bar', x=area, y=target, ax=axes[1], rot=75, colormap='coolwarm', legend=False, title=area)

fig.suptitle(f'Median total {target} for County & {area} from {year_min} to {year_max}'.replace("_", " ").title())

for axis in (fig_1, fig_2):
    axis.set_ylabel(target)
    # Only percentage metrics get a percent-formatted axis; price targets keep plain numbers.
    if 'pct' in target:
        axis.yaxis.set_major_formatter(mtick.PercentFormatter())

plt.show();


### yearly_dataframe

<b>yearly_dataframe</b> creates a dataframe that allows you to track the price changes by individual address year on year. 

It should be called on an un-modified/ungrouped dataset such as <b>filtered_df</b>.

It returns a dataframe containing all individual sales with the following new columns:
<ul>
    <li><b>prev_price</b> is the last recorded sale price</li>
    <li><b>price_diff</b> is the difference between the current sale and the last</li>
    <li><b>pct_change</b> is the percentage change between the current sale and the last</li>
    <li><b>prev_year</b> is the year of the previous sale</li>
    <li><b>year_diff</b> is the time difference between the current sale and the last</li>
</ul>



In [None]:
def yearly_dataframe(dataframe):
    """Build a sale-by-sale table with the change since each address's previous sale.

    The input is grouped on its identifying columns (address, location columns,
    ``year_sold``, ``date_sold`` and ``price``) and any remaining columns are
    reduced with a median. ``date_sold`` must hold datetime values.

    Added columns: ``year_month_sold`` (sale date truncated to the first of the
    month), ``prev_price``, ``price_diff``, ``pct_change``, ``prev_year`` and
    ``year_diff``. Rows with no price movement, or sold less than one calendar
    year after the previous sale (which includes each address's first sale),
    are removed from the result.
    """
    key_columns = ['address', 'property_type', 'town/city', 'postcode', 'county_name',
                   'ward_name', 'MSOA', 'LSOA', 'year_sold', 'date_sold', 'price']

    # One row per distinct sale, returned as an ordinary dataframe.
    sales = dataframe.groupby(key_columns).median().reset_index()

    # Month of sale as a timestamp (first day of that month).
    month_labels = sales['date_sold'].apply(lambda sold: sold.strftime("%Y-%m"))
    sales['year_month_sold'] = pd.to_datetime(month_labels, format='%Y-%m')

    # Compare every sale with the previous row for the same address.
    sales['prev_price'] = sales.groupby('address')['price'].shift()
    sales['price_diff'] = sales['price'] - sales['prev_price']
    sales['pct_change'] = (sales['price'] - sales['prev_price']) / sales['prev_price'] * 100
    sales['prev_year'] = sales.groupby('address')['year_sold'].shift()
    sales['year_diff'] = sales['year_sold'] - sales['prev_year']

    # The first sale of an address has nothing to compare against; store zeros instead of NaN.
    for column in ('prev_price', 'price_diff', 'prev_year', 'year_diff', 'pct_change'):
        sales[column] = np.where(np.isnan(sales[column]), 0, sales[column])

    # Keep only sales where the price moved and at least one calendar year passed.
    price_moved = sales['price_diff'] != 0
    a_year_or_more_later = sales['year_diff'] >= 1

    return sales.loc[price_moved & a_year_or_more_later]

### best_areas_yearly

In [None]:
best_areas_yearly = yearly_dataframe(filtered_df[filtered_df[area].isin(best_area_names)])
best_areas_yearly.head(1)

<b>best_areas_yearly</b> is a dataframe containing data for each sale for the best areas as defined by the <b>find_best_area</b> function. 

### filtered_yearly_dataframe

In [None]:
filtered_yearly_dataframe = yearly_dataframe(filtered_df)
filtered_yearly_dataframe.head(1)

<b>filtered_yearly_dataframe</b> is a dataframe containing yearly data for <b>only affordable</b> addresses across the entire country. <br>

### all_yearly_dataframe

In [101]:
all_yearly_dataframe = yearly_dataframe(all_properties_and_postcodes)
all_yearly_dataframe.head(1)

Unnamed: 0.1,address,property_type,town/city,postcode,county_name,ward_name,MSOA,LSOA,year_sold,date_sold,price,Unnamed: 0,new,freehold,Longitude,Latitude,year_month_sold,prev_price,price_diff,pct_change,prev_year,year_diff
2,al100ablongmead017,T,HATFIELD,AL10 0AB,Hertfordshire,Hatfield Central,WelwynHatfield010,WelwynHatfield010F,2019,2019-10-11,360000,18617676.0,0.0,1.0,-0.223341,51.773122,2019-10-01,240000.0,120000.0,50.0,2014.0,5.0


<b>all_yearly_dataframe</b> is a dataframe containing yearly data for <b>all</b> addresses across the entire country. <br>

### National Median

In [None]:
# Median per county for each metric, then the median of those county medians,
# reshaped into one row per metric with readable column names.
national_affordable_medians = (
    entire_period_filtered
    .groupby('county_name')[['pct_change', 'price_diff', 'year_diff', 'pct_change_per_year']]
    .median()
    .median()
    .reset_index()
    .rename(columns={'index': 'metric', 0: 'national_affordable_median'})
)
national_affordable_medians

<b>national_affordable_medians</b> is a dataframe containing the median values for total percentage change, difference in first and last price, difference in first and last year sold, and rate of pct change for properties deemed 'affordable' across the country. 

### performed_better
<b>performed_better</b> is a function to compare the median values of specific areas against the national median for affordable properties. 

You can pass the location as a list and it will check against all those locations. The same can be done with addresses with the <b>address</b> argument. You must always pass the locations / addresses as a list. 

It returns a dataframe that is the <b>national_affordable_medians</b> dataframe merged with the comparison data. 
   

In [None]:
def performed_better(locations,
                     dataframe=entire_period_filtered,
                     area=area,
                     address=None):
    """Compare median metrics of chosen areas (or addresses) with the national medians.

    Parameters
    ----------
    locations : list
        Values of the ``area`` column to compare. Ignored when ``address`` is given.
    dataframe : pandas.DataFrame
        Source data containing ``pct_change``, ``price_diff``, ``year_diff`` and
        ``pct_change_per_year`` columns.
    area : str
        Column used to match ``locations``.
    address : list, optional
        When given, rows are matched on the ``address`` column with these values instead.

    Returns
    -------
    pandas.DataFrame
        A copy of ``national_affordable_medians`` with one extra column per
        location. Values are added positionally in the order pct_change,
        price_diff, year_diff, pct_change_per_year, so the national table is
        expected to list its metrics in that same order. A location with no
        matching rows gets NaN values instead of raising.
    """
    metrics = ['pct_change', 'price_diff', 'year_diff', 'pct_change_per_year']

    comparison_df = national_affordable_medians.copy()

    # An explicit address list switches the comparison from areas to single addresses.
    if address is not None:
        area = 'address'
        locations = address

    area_of_interest = dataframe.loc[dataframe[area].isin(locations)]

    for location in locations:
        location_rows = area_of_interest.loc[area_of_interest[area] == location, metrics]
        # A plain list is assigned by position; assigning the Series directly would
        # try to align its metric-name index with the table's row index.
        comparison_df[location] = location_rows.median().tolist()

    return comparison_df

### Comparison Against National Median

The table below shows the median values for top N performing areas against the national median. 

<ul>
    <li><b>pct_change</b> Total percentage change over the entire period</li>
    <li><b>price_diff</b> Median difference from the first sale to the last</li>
    <li><b>year_diff</b> The median years between house sales</li>
    <li><b>pct_change_per_year</b> Median percentage change each year for the entire period</li>
</ul>

In [None]:
comparison_df = performed_better(locations=best_area_names)
comparison_df = comparison_df.set_index('metric')

comparison_df




### rolling_avg_subplots_for_area

<b>rolling_avg_subplots_for_area</b> creates scatter plots and Rolling Average values for areas passed to the function.

It returns two objects:<br>
<ul> 
    <li><b>figure</b> A Plotly figure containing the subplots.</li>
    <li><b>n_table</b> A dataframe containing the number of monthly data points plotted for each area.</li>
</ul>

In [135]:
def rolling_avg_subplots_for_area(dataframe=best_areas_yearly, target=target, area=area, rolling_val=0.5, top_n=4, line=False):
    """Plot monthly medians of ``target`` with a rolling mean, one subplot per area.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Sales data with ``year_sold``, ``year_month_sold``, the ``area`` column
        and the ``target`` column.
    target : str
        Column that is aggregated (median per month) and plotted.
    area : str
        Column identifying the area; the first ``top_n`` distinct values are plotted.
    rolling_val : int
        Window (in monthly data points) passed to ``Series.rolling``.
        NOTE(review): the default of 0.5 is not an integer window, so callers
        are expected to pass an integer explicitly - confirm and fix the default.
    top_n : int
        Number of areas (subplots) to draw, laid out in two columns.
    line : bool
        When true, draw a dotted red vertical line at the hard-coded date
        2019-09-01 on every subplot.

    Returns
    -------
    tuple
        ``(fig, n_table)``: the Plotly figure, and a one-row dataframe holding
        the number of monthly data points plotted for each area.
    """
    year_min = dataframe['year_sold'].min()
    year_max = dataframe['year_sold'].max()

    n_table = pd.DataFrame(columns=[], index=['n'])

    area_list = list(dataframe[area].unique())[:top_n]

    # Subplot coordinates for a two-column grid; an odd top_n is padded to a full last row.
    cords_list = create_grid_cords(top_n)

    # Size the grid from the coordinates actually returned. ``top_n // 2`` leaves the
    # last row off the figure whenever top_n is odd.
    fig = make_subplots(rows=len(cords_list) // 2, cols=2, subplot_titles=area_list, shared_xaxes='all', shared_yaxes=False)

    rolling_column = 'rolling_avg_' + str(rolling_val)

    for cord, location in zip(cords_list, area_list):
        # The year bounds come from this same frame, so they only exclude rows
        # whose year_sold is missing.
        in_area = dataframe.loc[(dataframe[area] == location)
                                & (dataframe['year_sold'] >= year_min)
                                & (dataframe['year_sold'] <= year_max)]

        # One median value per month; the groupby returns the months in ascending order.
        df = in_area.groupby('year_month_sold')[target].median().reset_index()

        fig.add_trace(go.Scatter(x=df['year_month_sold'], y=df[target], mode='markers', name=target.replace("_", " ").title()),
                      row=cord[0], col=cord[1])

        df[rolling_column] = df[target].rolling(rolling_val).mean()

        fig.add_trace(go.Scatter(x=df['year_month_sold'], y=df[rolling_column], mode='lines', name='Rolling Average ' + str(rolling_val)),
                      row=cord[0], col=cord[1])

        if line:
            fig.add_shape(type='line',
                          x0='2019-09-01',
                          x1='2019-09-01',
                          y0=df[target].min(),
                          y1=df[target].max(),
                          line=dict(color='Red', width=3, dash='dot'),
                          row=cord[0], col=cord[1])

        # Counted after the monthly grouping, so this is the number of plotted months.
        n_table[location] = len(df)

    fig.update_layout(title_text=f"Median {target} grouped by Month<br>Rolling Average: {rolling_val}".title().replace("_", " "))
    fig.update_layout(showlegend=True)
    fig.update_xaxes(nticks=len(dataframe['year_sold'].unique()) // 2)

    return fig, n_table

In [124]:
best_areas_figure, n_table = rolling_avg_subplots_for_area(dataframe=best_areas_yearly,
                                               target='price',
                                              rolling_val=3, top_n=2)

n_table
best_areas_figure


n1 is [1, 1]
n2 is [1, 2, 1, 2]


Unnamed: 0,Bromley008,Bristol041
n,97,79


# <i>plot moving and exp average on the same chart for various time lengths</i>
# <i>95th and 5th percentile</i>

### poly_subplots_for_area

<b>poly_subplots_for_area</b> creates scatter plots and a polynomial regression for areas passed to the function.

Currently you have to change the <b>rows</b> and <b>cols</b> for the subplot manually in the function to match the <b>top_n</b> areas. 

It returns two objects:<br>
<ul> 
    <li><b>figure</b> A Plotly figure containing the subplots.</li>
    <li><b>n_table</b> A dataframe containing the number of monthly data points plotted for each area.</li>
</ul>

In [None]:
def poly_subplots_for_area(dataframe=best_areas_yearly, target=target, area=area, top_n=6):
    """Plot monthly medians of ``target`` with a fitted polynomial, one subplot per area.

    For every area, polynomials of degree 0 to 19 are fitted to the monthly
    medians and the degree with the highest R2 score on those same points is
    drawn over the scatter.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Sales data with ``year_sold``, ``year_month_sold``, the ``area`` column
        and the ``target`` column.
    target : str
        Column that is aggregated (median per month) and plotted.
    area : str
        Column identifying the area; the first ``top_n`` distinct values are plotted.
    top_n : int
        Number of areas (subplots) to draw, laid out in two columns.

    Returns
    -------
    tuple
        ``(fig, n_table)``: the Plotly figure, and a one-row dataframe holding
        the number of monthly data points plotted for each area.
    """
    # Imported here so the function does not depend on the notebook's import cell.
    import warnings

    year_min = dataframe['year_sold'].min()
    year_max = dataframe['year_sold'].max()

    n_table = pd.DataFrame(columns=[], index=['n'])

    # Only the first top_n areas are plotted, so only those get a subplot title.
    area_list = list(dataframe[area].unique())[:top_n]

    # Subplot coordinates for a two-column grid; an odd top_n is padded to a full last row.
    cords_list = create_grid_cords(top_n)

    # Size the grid from the coordinates actually returned. ``top_n // 2`` leaves the
    # last row off the figure whenever top_n is odd.
    fig = make_subplots(rows=len(cords_list) // 2,
                        cols=2,
                        subplot_titles=area_list,
                        shared_xaxes='all',
                        print_grid=False)

    for cord, location in zip(cords_list, area_list):

        # The year bounds come from this same frame, so they only exclude rows
        # whose year_sold is missing.
        df = dataframe.loc[(dataframe[area] == location)
                           & (dataframe['year_sold'] >= year_min)
                           & (dataframe['year_sold'] <= year_max)].copy()

        # One median value per month of the year.
        df = df.groupby('year_month_sold')[target].median().reset_index()

        # polyfit needs numeric x values, so convert the month timestamps to ordinals.
        df['ord_dates'] = df['year_month_sold'].map(dt.toordinal)

        x_vals = df['ord_dates']
        y_vals = df[target]

        # Scatter of the monthly medians.
        fig.add_trace(go.Scatter(x=df['year_month_sold'],
                                 y=y_vals,
                                 mode='markers'),
                      row=cord[0], col=cord[1])

        # Evenly spaced x values from the first to the last month; the fitted curve is
        # evaluated on these. They do not depend on the degree, so build them once.
        myline = np.linspace(x_vals.iloc[0], x_vals.iloc[-1])

        best_score = 0
        best_i = 0

        # Most candidate degrees fit poorly and make numpy warn. Silence the warnings
        # only while fitting, rather than installing a process-wide filter.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            for i in range(20):
                # Least-squares polynomial of degree i, wrapped so it can be called like a function.
                mymodel = np.poly1d(np.polyfit(x_vals, y_vals, i))

                # R2 of the fitted curve against the points it was fitted on.
                score = r2_score(y_vals, mymodel(x_vals))

                if score > best_score:
                    best_score = score
                    best_i = i

            # Refit with the best-scoring degree.
            mymodel = np.poly1d(np.polyfit(x_vals, y_vals, best_i))

        # Convert the ordinal x values back to datetimes so the curve shares the scatter's x axis.
        my_line_dates = pd.Series(myline.astype(int)).map(dt.fromordinal)

        fig.add_trace(go.Scatter(x=my_line_dates,
                                 y=mymodel(myline),
                                 mode='lines'),
                      row=cord[0], col=cord[1])

        # Counted after the monthly grouping, so this is the number of plotted months.
        n_table[location] = len(df)

    fig.update_layout(title_text=f"{target} grouped by Month<br>With polynomial regression lines".title().replace("_", " "))
    fig.update_layout(showlegend=False)
    fig.update_xaxes(nticks=20)

    return fig, n_table

In [142]:
best_areas_figure, n_table = poly_subplots_for_area(dataframe=best_areas_yearly,
                                               target=target, 
                                                    top_n=4)

n_table
best_areas_figure




n1 is [1, 1, 2, 2]
n2 is [1, 2, 1, 2, 1, 2, 1, 2]


Unnamed: 0,Bromley008,Bristol041,Bristol037,Harlow005
n,97,79,71,114


### Map of Best Affordable Areas

In [None]:
fig = px.scatter_mapbox(best_areas_yearly, lat="Latitude", lon="Longitude", hover_name="address", hover_data=["price", "ward_name", 'MSOA', 'postcode'],
                        color=area, size='price',
                  color_continuous_scale=px.colors.cyclical.IceFire, zoom=6, height=1000, width=1000)
fig.update_layout(mapbox_style="carto-positron")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show();

### create_process_control_chart

Outputs and displays a process control chart. 
<ul>
    <li>You should already have filtered the passed dataframe to only contain the subsection / area of interest, and time period / grouping of interest. </li>
    <li>n_group is how many data points each point on the scatter will represent. </li>
    <li>x_name is the name of the column being passed as the x axis. x_name has to be a datetime format.</li>
    <li>y_name is the name of the column being passed as the y axis. Has to be numerical. </li>
    <li>time_interval is a pandas date offset alias. Default is 'BYS' (business year start).</li>
    <li>You can specify a custom title by passing it as a string to the <b>title=</b> argument.</li>
</ul>

In [None]:
def plotly_create_process_control_chart(dataframe, n_group, x_name, y_name, time_interval='BYS', title=False):
    """Build a process control chart of ``y_name`` over ``x_name``.

    The data is sorted by ``x_name`` and cut into consecutive blocks of
    ``n_group`` rows; each complete block becomes one point (earliest
    ``x_name`` in the block, median ``y_name``). For every interval produced by
    ``pd.date_range`` with ``time_interval`` a central line (mean) and upper /
    lower control limits (mean +/- 3 standard deviations) are drawn.

    Args:
        dataframe: Data already filtered to the area and period of interest.
        n_group: Number of rows summarised by each plotted point.
        x_name: Name of the datetime column used for the x axis.
        y_name: Name of the numeric column used for the y axis.
        time_interval: pandas offset alias defining the control-limit windows.
        title: Optional custom title; when falsy a title is derived from the
            column names.

    Returns:
        The plotly ``go.Figure`` holding the chart.
    """
    # Organises data in chronological order
    dataframe = dataframe.sort_values(x_name)

    # Finds first and last date in the data
    first_date = dataframe[x_name].min()
    last_date = dataframe[x_name].max()

    # One point per complete block of n_group rows. The upper bound is
    # len + 1 so the final complete block is included (it used to be dropped,
    # e.g. the last row was never plotted when n_group == 1).
    points = []
    for stop in range(n_group, len(dataframe) + 1, n_group):
        df_slice = dataframe.iloc[stop - n_group: stop]
        points.append((df_slice[x_name].min(), df_slice[y_name].median()))

    # Built in one go: DataFrame.append no longer exists in pandas 2.x.
    final_dataframe = pd.DataFrame(points, columns=[x_name, y_name])

    # Interval boundaries for the central line and control limits. The range is
    # extended a year past the last date so the final period gets a closing
    # boundary. (The removed `closed=None` argument was the default anyway.)
    date_range = pd.date_range(start=first_date,
                               end=last_date + relativedelta(years=1),
                               freq=time_interval)

    # Creates plotly graph object
    fig = go.Figure()

    # Adds a scatter trace to the graph object of the slices we created
    fig.add_trace(go.Scatter(x=final_dataframe[x_name], y=final_dataframe[y_name], mode='lines+markers', name='Rolling Median House Price'))

    limit_style = dict(color="Black", width=3, dash="dot")

    # Walk consecutive boundary pairs and draw the lines for each window
    for window_start, window_end in zip(date_range[:-1], date_range[1:]):

        # Half-open window [start, end): a row dated exactly on a boundary
        # belongs to exactly one window instead of being excluded from both.
        in_window = (dataframe[x_name] >= window_start) & (dataframe[x_name] < window_end)
        date_slice = dataframe.loc[in_window, y_name]

        # Nothing sold in this window: there is no mean to draw
        if date_slice.empty:
            continue

        # Mean, STD, UCL and LCL
        mean = date_slice.mean()
        std = date_slice.std()

        # (level, line style, extra shape kwargs), drawn in the order
        # central line, upper control line, lower control line
        lines = [(mean, dict(color="Red", width=1), {})]

        # A single observation has no standard deviation, so only the
        # central line is meaningful in that case.
        if not pd.isna(std):
            lines.append((mean + (std * 3), limit_style, {}))
            lines.append((mean - (std * 3), limit_style, {'name': 'LCL'}))

        for level, line_style, extra in lines:
            fig.add_shape(type='line',
                          x0=window_start,
                          y0=level,
                          x1=window_end,
                          y1=level,
                          line=line_style,
                          **extra)

    fig.update_xaxes(nticks=20)

    if title:
        fig.update_layout(title=title.replace("_", " ").title())
    else:
        fig.update_layout(title=f'{y_name} by {x_name}'.replace("_", " ").title())
    return fig

In [None]:
# Control chart of sale price over sale date for the best-areas data; each plotted
# point summarises a block of 25 consecutive sales (median price).
plotly_create_process_control_chart(dataframe=best_areas_yearly, 
                             n_group=25, 
                             x_name='date_sold', 
                             y_name='price')

In [None]:
# Same chart on pre-aggregated data: the frame is first reduced to one median price
# per year_month_sold, and n_group=1 makes each plotted point a single month.
plotly_create_process_control_chart(dataframe=best_areas_yearly.groupby('year_month_sold')['price'].median().reset_index(), 
                             n_group=1, 
                             x_name='year_month_sold', 
                             y_name='price',
                             title='Price by Year Month Sold for best_areas_yearly')

In [None]:
def new_compare_scatter(dataframe, x_name, y_name, z_name, rolling_type='ema', ema_val=0.5, rolling_val=10, title=False):
    """Plot a smoothed line of ``y_name`` over ``x_name`` for each ``z_name`` value.

    The data is first reduced to the median ``y_name`` per (``x_name``,
    ``z_name``) pair. Those medians are drawn as faint markers, and one
    smoothed line per distinct ``z_name`` value is added on top.

    Args:
        dataframe: Source data containing the three named columns.
        x_name: Column used for the x axis.
        y_name: Numeric column to plot and smooth.
        z_name: Column whose distinct values each get their own line.
        rolling_type: ``'ema'`` for an exponential moving average or ``'avg'``
            for a simple rolling mean.
        ema_val: Smoothing factor (``alpha``) used when ``rolling_type='ema'``.
        rolling_val: Window size used when ``rolling_type='avg'``.
        title: Optional custom title; when falsy a title describing the
            chosen smoothing is generated.

    Returns:
        The plotly ``go.Figure`` holding the chart.

    Raises:
        ValueError: If ``rolling_type`` is neither ``'ema'`` nor ``'avg'``.
    """
    # Fail loudly on a mistyped option rather than returning a chart with no
    # smoothed lines and a misleading title.
    if rolling_type not in ('ema', 'avg'):
        raise ValueError(f"rolling_type must be 'ema' or 'avg', got {rolling_type!r}")

    df = dataframe.groupby([x_name, z_name])[y_name].median().reset_index()

    figure = go.Figure()
    figure.add_trace(go.Scatter(x=df[x_name],
                                y=df[y_name],
                                mode='markers',
                                opacity=0.2,
                                name='Sale Price'))

    for z in df[z_name].unique():
        df_by_type = df.loc[df[z_name] == z]

        if rolling_type == 'ema':
            smoothed = df_by_type[y_name].ewm(alpha=ema_val, adjust=False).mean()
        else:
            smoothed = df_by_type[y_name].rolling(rolling_val).mean()

        # Fall back to the raw value when it has no friendly label, so an
        # unmapped category (or a z_name other than property type) still plots.
        figure.add_trace(go.Scatter(x=df_by_type[x_name],
                                    y=smoothed,
                                    name=property_type_dict.get(z, str(z)),
                                    mode='markers+lines'))

    # The title is set once, after the loop, and matches the smoothing actually
    # used (the rolling-average title used to overwrite the EMA one).
    if title:
        figure.update_layout(title=title.title())
    elif rolling_type == 'ema':
        figure.update_layout(title=f'{y_name} by {x_name} for {z_name} with EMA - {ema_val}'.replace("_", " ").title())
    else:
        figure.update_layout(title=f'{y_name} by {x_name}<br>Split by {z_name}<br>Rolling Average - {rolling_val}'.replace("_", " ").title())

    return figure

In [None]:
# Median sale price per date, split by property type, smoothed with a
# 10-point simple rolling mean.
new_compare_scatter(best_areas_yearly,
                    x_name='date_sold',
                    y_name='price',
                    z_name='property_type',
                    rolling_type='avg',
                    rolling_val=10)

In [None]:
# Same comparison, smoothed with an exponential moving average (alpha = 0.1).
new_compare_scatter(best_areas_yearly,
                    x_name='date_sold',
                    y_name='price',
                    z_name='property_type',
                    rolling_type='ema',
                    ema_val=0.1)

### Regeneration Areas

In September 2019 the government announced regeneration projects for a number of towns.

<b>df_all_regen_towns</b> is a dataframe with only entries present on the following lists:
<ul>
    <li><b>regen_towns</b> is a list of ward names of the towns due for regeneration, taken from a government press release.</li>
    <li><b>guardian_regen_towns</b> is a list of the actual town names themselves, taken from a Guardian article.</li>
</ul>

In [102]:
# Read both lists via explicit paths instead of changing directory first, so a
# missing file can no longer leave the notebook stranded in regen_folder.

# regen.txt: one ward name per line. Surrounding whitespace and blank lines
# (e.g. from a trailing newline) are dropped so they cannot prevent a match.
with open(os.path.join(regen_folder, 'regen.txt')) as f:
    regen_towns = [line.strip() for line in f.read().split("\n") if line.strip()]

# guardian_regen_edit.txt: comma separated town names. Names are stripped on
# both sides; the previous left-only strip kept a trailing newline on the last one.
with open(os.path.join(regen_folder, 'guardian_regen_edit.txt')) as f:
    guardian_regen_towns = [town.strip() for town in f.read().split(",") if town.strip()]

ward_mask = all_yearly_dataframe['ward_name'].isin(regen_towns)
town_mask = all_yearly_dataframe['town/city'].isin(guardian_regen_towns)

df_regen_towns = all_yearly_dataframe.loc[ward_mask]

df_guardian_regen_towns = all_yearly_dataframe.loc[town_mask]

# Union of the two selections. Using a combined mask keeps each sale once even
# when it matches both lists (stacking the two frames duplicated those rows),
# and avoids DataFrame.append, which pandas 2.x no longer provides.
df_all_regen_towns = all_yearly_dataframe.loc[ward_mask | town_mask]

# Kept so the cell still finishes with the working directory at notebook_folder.
os.chdir(notebook_folder)

### df_yearly_regen_towns

In order to better see the effects of the 2019 announcement, the following cell creates a slice of the dataframe containing only the sales from 2018 onwards, i.e. the period around the announcement.

In [139]:
# Keep only the regeneration-town sales whose year_sold is 2018 or later.
# NOTE(review): the variable name says "from_2017" but the filter starts at 2018 -
# confirm which of the two is intended before relying on the name.
df_regen_towns_from_2017 = df_all_regen_towns.loc[df_all_regen_towns['year_sold'] >= 2018]

# Preview the first five rows of the slice.
df_regen_towns_from_2017.head(5)

Unnamed: 0.1,address,property_type,town/city,postcode,county_name,ward_name,MSOA,LSOA,year_sold,date_sold,price,Unnamed: 0,new,freehold,Longitude,Latitude,year_month_sold,prev_price,price_diff,pct_change,prev_year,year_diff
2657018,cf481hjgethinplace07,T,MERTHYR TYDFIL,CF48 1HJ,(pseudo) Wales,Plymouth,MerthyrTydfil005,MerthyrTydfil005A,2019,2019-11-22,103500,17951329.0,0.0,1.0,-3.363638,51.72212,2019-11-01,82000.0,21500.0,26.219512,2013.0,6.0
2657625,cf481pxgethinstreet06,T,MERTHYR TYDFIL,CF48 1PX,(pseudo) Wales,Plymouth,MerthyrTydfil005,MerthyrTydfil005A,2019,2019-11-20,97000,18156715.0,0.0,1.0,-3.364365,51.722723,2019-11-01,113000.0,-16000.0,-14.159292,2008.0,11.0
2657882,cf481sxchapelstreet7waunwylltcourt,T,MERTHYR TYDFIL,CF48 1SX,(pseudo) Wales,Plymouth,MerthyrTydfil005,MerthyrTydfil005A,2019,2019-12-18,120000,18561449.0,0.0,1.0,-3.364434,51.724147,2019-12-01,100000.0,20000.0,20.0,2016.0,3.0
2658097,cf481yxanthonygrove052,S,MERTHYR TYDFIL,CF48 1YX,(pseudo) Wales,Plymouth,MerthyrTydfil005,MerthyrTydfil005A,2019,2019-06-07,120000,18661533.0,0.0,1.0,-3.371275,51.725726,2019-06-01,82000.0,38000.0,46.341463,2005.0,14.0
2659420,cf484aasliproad0wernlascottage,D,MERTHYR TYDFIL,CF48 4AA,(pseudo) Wales,Plymouth,MerthyrTydfil006,MerthyrTydfil006E,2019,2019-07-12,450000,18387915.0,0.0,1.0,-3.359256,51.729508,2019-07-01,225000.0,225000.0,100.0,2009.0,10.0


In [140]:
# Subplots of price per ward for the recent regeneration-town slice.
# NOTE(review): rolling_avg_subplots_for_area is defined earlier in the notebook;
# rolling_val=2 / top_n=6 / line=True presumably mean a 2-point rolling average,
# six wards and a drawn line - confirm against its definition.
regen_fig, n_table = rolling_avg_subplots_for_area(dataframe=df_regen_towns_from_2017, area='ward_name',
                                    target='price',
                                    rolling_val=2,
                                    top_n=6,
                                    line=True)

# Bare expressions so the notebook displays both results.
n_table
regen_fig

n1 is [1, 1, 2, 2, 3, 3]
n2 is [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2]


Unnamed: 0,Plymouth,Barrow,Northumberland,North East,Warwick,Hambleton
n,30,18,31,31,31,29


In [141]:
# Same subplots as the previous cell, but plotting the notebook-level `target`
# metric instead of the raw price.
regen_fig, n_table = rolling_avg_subplots_for_area(dataframe=df_regen_towns_from_2017,
                                    area='ward_name',
                                    target=target,
                                    rolling_val=2,
                                    top_n=6,
                                    line=True)

# Bare expressions so the notebook displays both results.
n_table
regen_fig



n1 is [1, 1, 2, 2, 3, 3]
n2 is [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2]


Unnamed: 0,Plymouth,Barrow,Northumberland,North East,Warwick,Hambleton
n,30,18,31,31,31,29


# all_regen_plot

Similar to area_sub_plot above, but plots all lines on the same figure.

In [None]:
def all_regen_plot(dataframe=df_regen_towns_from_2017, target=target, area=area, ema_val=0.5, groupby_option='date_sold'):
    """Plot an EMA line of ``target`` for every ``area`` value on one figure.

    For each distinct value in the ``area`` column the rows are reduced to the
    median ``target`` per ``groupby_option`` value. The medians are drawn as
    very faint markers and their exponential moving average (``alpha`` =
    ``ema_val``) as a named line.

    Returns:
        A tuple ``(fig, n_table)``: the plotly figure and a one-row DataFrame
        (index ``'n'``) with, per location, the number of grouped points plotted.
    """
    # Year bounds come from the frame itself; rows are kept when year_sold lies
    # inside them (inclusive on both ends).
    sold_year = dataframe['year_sold']
    within_years = sold_year.between(sold_year.min(), sold_year.max())

    ema_column = f'EMA_{ema_val}'
    n_table = pd.DataFrame(columns=[], index=['n'])

    # Single-cell subplot grid: every location shares the same axes.
    fig = make_subplots(rows=1, cols=1)

    for location in dataframe[area].unique():
        selected = dataframe.loc[(dataframe[area] == location) & within_years]
        medians = selected.groupby(groupby_option)[target].median().reset_index()

        # Raw medians, unnamed and almost transparent, as background context.
        fig.add_trace(go.Scatter(x=medians[groupby_option],
                                 y=medians[target],
                                 mode='markers',
                                 name='',
                                 opacity=0.05))

        # Smoothed series, labelled with the location.
        medians[ema_column] = medians[target].ewm(alpha=ema_val, adjust=False).mean()
        fig.add_trace(go.Scatter(x=medians[groupby_option],
                                 y=medians[ema_column],
                                 mode='lines+markers',
                                 name=location))

        n_table[location] = len(medians)

    fig.update_layout(
        title_text=f"Comparison of {target}<br>{area}<br>EMA - {ema_val}".title().replace("_", " "),
        showlegend=True,
    )
    fig.update_xaxes(nticks=15, matches='x')

    return fig, n_table

In [None]:
# All wards on one figure: EMA (alpha = 0.2) of the median percentage change.
regen_fig, n_table = all_regen_plot(dataframe=df_regen_towns_from_2017, area='ward_name', target='pct_change', ema_val=0.2)

# Bare expressions so the notebook displays both results.
n_table
regen_fig

In [None]:
# All wards on one figure: EMA (alpha = 0.2) of the median sale price.
regen_fig, n_table = all_regen_plot(dataframe=df_regen_towns_from_2017, area='ward_name',
                                      target='price',
                                      ema_val=0.2)

# Bare expressions so the notebook displays both results.
n_table
regen_fig

In [None]:
# Preview the first five rows of the full yearly dataframe.
all_yearly_dataframe.head(5)