In [None]:
# Variables: notebook-wide parameters, defined once at the top.
# A comment is used for this header rather than a bare string literal: a bare
# string is an expression statement, and it is echoed as cell output once
# InteractiveShell.ast_node_interactivity is set to "all" further down.
#
# NOTE(review): none of these names are read in the cells visible here; the
# per-line notes are inferred from the names only - confirm against later cells.

target = 'pct_change'  # presumably the column being analysed/predicted
area = 'MSOA'          # presumably the geography level used for grouping
year_min = 1990        # presumably the first sale year of interest
year_max = 2020        # presumably the last sale year of interest
budget_min = 300000    # presumably the lower bound of the price budget
budget_max = 550000    # presumably the upper bound of the price budget

'''Importing Modules and Settings'''

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


from datetime import datetime as dt

import pandas as pd
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 400000)
pd.set_option("display.width", 1000)

from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [20, 10]

import numpy as np

from sklearn.linear_model import LinearRegression, TheilSenRegressor
from sklearn.model_selection import train_test_split
from scipy import stats

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split

import seaborn as sns

import plotly as py
import plotly.graph_objs as go
import plotly.express as px

font = {'family' : 'arial',
        'weight' : 'normal',
        'size'   : 18}

plt.rc('font', **font)



In [None]:
# Column names for the Land Registry price-paid CSV, listed in file order.
# They are passed to read_csv as `names=` when the property data is loaded.
column_list = [
    'TUI', 'price', 'date', 'postcode',
    'property_type', 'old/new', 'duration',
    'PAON', 'SAON', 'street', 'locality', 'town/city', 'district', 'county',
    'ppd_category', 'record_status',
]

# Postcode prefixes.
# NOTE(review): presumably postcode areas around London and inside London
# respectively - neither list is used in the cells visible here, so confirm.
post_codes = 'WD EN HA IG RM DA BR CR KT TW UB'.split()
london_post_codes = 'E WC EC N NW SE SW W'.split()

# Row cap of ten million.
# NOTE(review): this value is not passed to the read_csv calls that follow, so
# the CSVs are currently read in full - confirm whether that is intended.
nrows = 10 ** 7

# Load the property sales CSV, labelling its columns with column_list.
main_df = pd.read_csv('pp-complete.csv', names=column_list)
print('read property CSV'.title())

# Load the postcode lookup CSV, keeping only the columns listed below.
postcode_data = pd.read_csv(
    'National_Statistics_Postcode_Lookup_UK.csv',
    usecols=[
        'Postcode 1',
        'Postcode 2',
        'Postcode 3',
        'Easting',
        'Northing',
        'County Name',
        'Ward Name',
        'Lower Super Output Area Code',
        'Lower Super Output Area Name',
        'Middle Super Output Area Name',
        'Output Area Classification Name',
        'Longitude',
        'Latitude',
    ],
)
print('read postcode CSV'.title())

# Give the lookup's 'Postcode 3' column the same name as main_df's postcode
# column, so the two frames share a key name for the later merge.
postcode_data = postcode_data.rename({'Postcode 3': 'postcode'}, axis='columns')

In [None]:
# Drop the columns that are not needed.
main_df = main_df.drop(columns=['TUI', 'district', 'record_status', 'locality', 'county'])
print('Dropped columns'.title())

# Keep only PPD category 'A' rows; this removes category 'B' so that only
# residential sales are left.
main_df = main_df.loc[main_df['ppd_category'] == 'A']
print('removed non-residental sale data'.title())

# Cast price to a 64-bit integer in a single vectorised step instead of
# calling int() once per row.
main_df['price'] = main_df['price'].astype('int64')
print('Converted prices to integers'.title())

# Remove outliers: keep prices from 75,000 to 2,000,000, both bounds inclusive.
main_df = main_df.loc[main_df['price'].between(75000, 2000000)]
print('Removed outliers'.title())

# Parse the raw 'date' strings into datetimes, then derive the year of sale.
main_df['date_sold'] = pd.to_datetime(main_df['date'], format='%Y-%m-%d %H:%M')
print('converted date_sold'.title())
main_df['year_sold'] = main_df['date_sold'].dt.year
print('converted year'.title())

# The raw 'date' column has been replaced by 'date_sold', so drop it.
main_df = main_df.drop(columns=['date'])
print('DateTime objects created'.title())

# Convert every postcode to a Python string. The full postcode is kept (no
# prefix slicing happens here); a missing postcode (NaN) becomes the text 'nan'.
main_df['postcode'] = main_df['postcode'].apply(str)
print('Converted Postcodes'.title())

# Strip spaces from the MSOA and LSOA names. The .str accessor passes missing
# values through as NaN, whereas calling x.replace(...) per row raises
# AttributeError as soon as a name is NaN (a float).
postcode_data['Middle Super Output Area Name'] = (
    postcode_data['Middle Super Output Area Name'].str.replace(" ", "", regex=False)
)
postcode_data['Lower Super Output Area Name'] = (
    postcode_data['Lower Super Output Area Name'].str.replace(" ", "", regex=False)
)


# Replace every NaN in the frame with 0.
main_df = main_df.fillna(0)
print('Removed NaN Values'.title())

# Build a normalised address key: postcode + street + SAON + PAON joined as
# text, with all spaces removed and everything lower-cased.
main_df['address'] = (
    main_df['postcode'].astype(str)
    + main_df['street'].astype(str)
    + main_df['SAON'].astype(str)
    + main_df['PAON'].astype(str)
).str.replace(" ", "", regex=False).str.lower()
print('Created Address Column')

# The separate address parts and the PPD category are no longer needed.
main_df = main_df.drop(columns=['PAON', 'SAON', 'street', 'ppd_category'])
print('Dropped address columns'.title())

# Renumber rows 0..n-1 after the filtering done earlier.
main_df = main_df.reset_index(drop=True)
print('reset index'.title())

# Take the most recent sale year present in the data, so it never has to be
# guessed when a different number of rows is loaded from the CSV.
year_to_check = main_df['year_sold'].max()