# Import packages

In [1]:
import os
import re
from itertools import zip_longest

import numpy as np
import pandas as pd
import parse
import seaborn as sns
from bs4 import BeautifulSoup # handle html strings
from matplotlib import pyplot as plt

from ProcessHTML import ProcessHTML
from ExtractRooms import ExtractRooms
from GeneralizeDataset import GeneralizeDataset
from CreateInputDataset import CreateInputDataset

# Read files

In [2]:
# Collect the data files whose name contains "H1" or "H2".
# os.listdir() returns entries in arbitrary order, so the listing is sorted:
# later cells index into the result (files[0]) and must pick the same file on
# every machine and every run.
folder = "../datasets"
paths = [
    os.path.join(folder, name)
    for name in sorted(os.listdir(folder))
    if "H1" in name or "H2" in name
]

In [3]:
# Read every selected CSV into its own DataFrame, in the same order as `paths`.
files = []
for csv_path in paths:
    files.append(pd.read_csv(csv_path, encoding="ISO8859-1"))

In [4]:
# Display the first loaded file as read from disk.
files[0]

Unnamed: 0,Full Address,Created,Advertised,Agreed,Completed,Date Listing Last Cancelled,Sale or Let,RTD3308_outside_space1 - Outside Space Description,EweMove Description S1 Features,EweMove Description S2 Description,...,Price / Rent,Price Qualifier,Sale Price % Achieved,Current EPC - EPC Expiry Date,DESC Council Tax Band,DESC Leasehold Ground Rent,DESC Leasehold Service Charge,ZPG_lease_expiry_years_remaining,# of Enquiry or viewings,# of Apps/Offers
0,"2 Linden Drive, Farnham Royal SL2 3DA",01-07-2019,,,,,Sale,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>,,,,,,,,0,0
1,"14 Ploughman's Gardens, Woodmansey HU17 0GN",02-07-2019,05-07-2019,,,29-10-2019,Sale,Back Garden,<ul><li>Upgraded Modern 3 Bed Semi Detached Ho...,What a great opportunity to step onto or up th...,...,"<font color='blue'>&pound;200,000</font><br>Of...",Offers Over,,14-08-2028,Band C,,,,0,0
2,"5 White Otter Close, Birkdale, PR8 3FE",02-07-2019,,,,,Sale,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>,,,,,,,,0,0
3,"10 Longroyd Street North, Leeds LS11 5EU",02-07-2019,,,,,Rental,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>Monthly,Monthly,,,,,,,0,0
4,"28 Buttermere Close, Southampton SO16 9GL",04-07-2019,,,,04-07-2019,Rental,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>Monthly,Monthly,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3294,"5 Gordons, Basildon SS13 3DZ",28-11-2019,30-11-2019,09-12-2019,20-12-2019,,Rental,Back Garden,<ul><li>Call NOW 24/7 or book instantly online...,"This is a really well proportioned home, with ...",...,"<font color='blue'>&pound;1,050</font><br>Monthly",Monthly,,,Band B,,,,0,10
3295,"8 Woodward Heights, Grays RM17 5RR",06-12-2019,26-12-2019,,,08-07-2020,Sale,Rear Garden,<ul><li>Call NOW 24/7 or book instantly online...,Such a lovely detached home with space aplenty...,...,"<font color='blue'>&pound;600,000</font><br>Of...",Offers Over,,16-12-2029,Band G,,,,0,1
3296,"38 Kimberley Road, Benfleet SS7 5NQ",06-12-2019,26-12-2019,,,11-08-2020,Sale,Back Garden,<ul><li>Call NOW 24/7 or book instantly online...,This detached home has been very smartly updat...,...,"<font color='blue'>&pound;450,000</font><br>Of...",Offers Over,,08-10-2026,Band D,,,,0,0
3297,"4 Malwood Road, Benfleet SS7 5SE",06-12-2019,,,,02-07-2020,Sale,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>,,,,,,,,0,0


In [5]:
# Load PropertyData_wDesc.csv; it is used below as the reference whose column
# names are compared with those of the H1/H2 files.
subset = pd.read_csv("../datasets/PropertyData_wDesc.csv", encoding="ISO8859-1")

# File contents

In [6]:
# Check that every loaded file has the same set of column names.
# The first file provides the reference (sorted) column list; the scan stops at
# the first file that deviates and prints that file's path.
col_names = None
for csv_path, frame in zip(paths, files):
    current_columns = sorted(frame.columns)
    if col_names is None:
        col_names = current_columns
    elif col_names != current_columns:
        print("{}: ERROR".format(csv_path))
        break

In [7]:
# Report the column names that the H1/H2 files and the reference subset do not
# share. Walking the two sorted lists position by position only works when they
# differ by a one-for-one swap: a single extra column on either side shifts
# every later pair, and zip() silently drops the tail of the longer list.
# Set differences avoid both problems.
file_columns = set(col_names)
subset_columns = set(subset.columns)
only_in_subset = sorted(subset_columns - file_columns)
only_in_files = sorted(file_columns - subset_columns)

# Two-line layout per pair: the subset's name first, then the files' name.
# "-" marks a side that has no counterpart.
for original, new in zip_longest(only_in_subset, only_in_files, fillvalue="-"):
    print("{:40s}\n{:40s}\n".format(original, new))

Postcode                                
Full Address                            



# Extract postcode from full address

In [8]:
# Regular expression used to pull a postcode out of free text. Piece by piece:
#   [A-Za-z]{1,2}   one or two letters
#   [0-9Rr]         a digit, or the letter R
#   [0-9A-Za-z]?    an optional further digit or letter
#   (single space)
#   [0-9]           one digit
#   [AB D-H J L N P-U W-Z]{2} (either case)   two letters; C, I, K, M, O and V
#                                             are not accepted here
# The pattern is unanchored and has no word boundaries, so it can also match
# inside longer tokens.
pattern = "[A-Za-z]{1,2}[0-9Rr][0-9A-Za-z]? [0-9][ABD-HJLNP-UW-Zabd-hjlnp-uw-z]{2}"

In [9]:
# Quick sanity check: list every postcode-shaped substring of a sample sentence.
msg = "Currently I live in SW5 9QN, two years ago I lived in NG9 2FF"
[found.group(0) for found in re.finditer(pattern, msg)]

['SW5 9QN', 'NG9 2FF']

In [10]:
# Overwrite each file's "Full Address" column with the first postcode found in
# the address (NaN when there is no address or no match).
for file in files:
    postcodes = []
    for address in file["Full Address"]:
        # A missing address arrives as float NaN, and this cell itself writes
        # NaN for unmatched rows. The regex functions raise TypeError on
        # non-strings, so those values are mapped straight to NaN. This also
        # keeps the cell safe to run a second time.
        match = re.search(pattern, address) if isinstance(address, str) else None
        # The pattern has no capture groups, so group(0) is the same text that
        # re.findall(...)[0] would return.
        postcodes.append(match.group(0) if match else np.nan)
    file["Full Address"] = postcodes

In [11]:
# Display the first file again: "Full Address" now holds the extracted postcode.
files[0]

Unnamed: 0,Full Address,Created,Advertised,Agreed,Completed,Date Listing Last Cancelled,Sale or Let,RTD3308_outside_space1 - Outside Space Description,EweMove Description S1 Features,EweMove Description S2 Description,...,Price / Rent,Price Qualifier,Sale Price % Achieved,Current EPC - EPC Expiry Date,DESC Council Tax Band,DESC Leasehold Ground Rent,DESC Leasehold Service Charge,ZPG_lease_expiry_years_remaining,# of Enquiry or viewings,# of Apps/Offers
0,SL2 3DA,01-07-2019,,,,,Sale,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>,,,,,,,,0,0
1,HU17 0GN,02-07-2019,05-07-2019,,,29-10-2019,Sale,Back Garden,<ul><li>Upgraded Modern 3 Bed Semi Detached Ho...,What a great opportunity to step onto or up th...,...,"<font color='blue'>&pound;200,000</font><br>Of...",Offers Over,,14-08-2028,Band C,,,,0,0
2,PR8 3FE,02-07-2019,,,,,Sale,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>,,,,,,,,0,0
3,LS11 5EU,02-07-2019,,,,,Rental,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>Monthly,Monthly,,,,,,,0,0
4,SO16 9GL,04-07-2019,,,,04-07-2019,Rental,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>Monthly,Monthly,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3294,SS13 3DZ,28-11-2019,30-11-2019,09-12-2019,20-12-2019,,Rental,Back Garden,<ul><li>Call NOW 24/7 or book instantly online...,"This is a really well proportioned home, with ...",...,"<font color='blue'>&pound;1,050</font><br>Monthly",Monthly,,,Band B,,,,0,10
3295,RM17 5RR,06-12-2019,26-12-2019,,,08-07-2020,Sale,Rear Garden,<ul><li>Call NOW 24/7 or book instantly online...,Such a lovely detached home with space aplenty...,...,"<font color='blue'>&pound;600,000</font><br>Of...",Offers Over,,16-12-2029,Band G,,,,0,1
3296,SS7 5NQ,06-12-2019,26-12-2019,,,11-08-2020,Sale,Back Garden,<ul><li>Call NOW 24/7 or book instantly online...,This detached home has been very smartly updat...,...,"<font color='blue'>&pound;450,000</font><br>Of...",Offers Over,,08-10-2026,Band D,,,,0,0
3297,SS7 5SE,06-12-2019,,,,02-07-2020,Sale,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>,,,,,,,,0,0


In [12]:
# Tidy the first file now that "Full Address" contains postcodes:
#   1. give the column the name it actually represents,
#   2. drop the rows where no postcode could be extracted,
#   3. renumber the remaining rows 0..n-1.
# reset_index(drop=True) does the renumbering directly, replacing a hand-built
# {old_label: new_label} mapping passed to rename().
file = (
    files[0]
    .rename(columns={"Full Address": "Postcode"})
    .dropna(subset=["Postcode"])
    .reset_index(drop=True)
)
file

Unnamed: 0,Postcode,Created,Advertised,Agreed,Completed,Date Listing Last Cancelled,Sale or Let,RTD3308_outside_space1 - Outside Space Description,EweMove Description S1 Features,EweMove Description S2 Description,...,Price / Rent,Price Qualifier,Sale Price % Achieved,Current EPC - EPC Expiry Date,DESC Council Tax Band,DESC Leasehold Ground Rent,DESC Leasehold Service Charge,ZPG_lease_expiry_years_remaining,# of Enquiry or viewings,# of Apps/Offers
0,SL2 3DA,01-07-2019,,,,,Sale,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>,,,,,,,,0,0
1,HU17 0GN,02-07-2019,05-07-2019,,,29-10-2019,Sale,Back Garden,<ul><li>Upgraded Modern 3 Bed Semi Detached Ho...,What a great opportunity to step onto or up th...,...,"<font color='blue'>&pound;200,000</font><br>Of...",Offers Over,,14-08-2028,Band C,,,,0,0
2,PR8 3FE,02-07-2019,,,,,Sale,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>,,,,,,,,0,0
3,LS11 5EU,02-07-2019,,,,,Rental,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>Monthly,Monthly,,,,,,,0,0
4,SO16 9GL,04-07-2019,,,,04-07-2019,Rental,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>Monthly,Monthly,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3269,SS13 3DZ,28-11-2019,30-11-2019,09-12-2019,20-12-2019,,Rental,Back Garden,<ul><li>Call NOW 24/7 or book instantly online...,"This is a really well proportioned home, with ...",...,"<font color='blue'>&pound;1,050</font><br>Monthly",Monthly,,,Band B,,,,0,10
3270,RM17 5RR,06-12-2019,26-12-2019,,,08-07-2020,Sale,Rear Garden,<ul><li>Call NOW 24/7 or book instantly online...,Such a lovely detached home with space aplenty...,...,"<font color='blue'>&pound;600,000</font><br>Of...",Offers Over,,16-12-2029,Band G,,,,0,1
3271,SS7 5NQ,06-12-2019,26-12-2019,,,11-08-2020,Sale,Back Garden,<ul><li>Call NOW 24/7 or book instantly online...,This detached home has been very smartly updat...,...,"<font color='blue'>&pound;450,000</font><br>Of...",Offers Over,,08-10-2026,Band D,,,,0,0
3272,SS7 5SE,06-12-2019,,,,02-07-2020,Sale,,<ul></ul>,,...,<font color='blue'>&pound;0</font><br>,,,,,,,,0,0


# Test compatibility with CreateInputDataset

In [13]:
# Read the CSVs again from disk. The frames loaded earlier had their
# "Full Address" column overwritten with postcodes, so this restores the raw
# data (presumably so CreateInputDataset sees the original addresses).
files = []
for csv_path in paths:
    files.append(pd.read_csv(csv_path, encoding="ISO8859-1"))

In [14]:
# Build a CreateInputDataset from the first raw file.
# NOTE(review): extract_postcode=True presumably makes the class derive the
# postcode from "Full Address" itself; confirm in the CreateInputDataset module.
creation = CreateInputDataset(files[0], extract_postcode=True)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [15]:
# Display the frame returned by get_general_dataset().
creation.get_general_dataset()

Unnamed: 0,Postcode,Sale or Let,Price Qualifier,DESC Council Tax Band,RTD3316_condition1 - Condition Description,# of Enquiry or viewings,# of Apps/Offers
1,850,1,7,2,0,0,0
23,1802,0,4,0,0,0,4
24,1805,1,8,1,0,0,1
95,1809,0,4,1,0,0,10
103,1808,0,4,0,0,0,2
...,...,...,...,...,...,...,...
3264,219,0,4,2,0,0,2
3268,2334,0,4,3,0,0,3
3269,2308,0,4,1,0,0,10
3270,2040,1,7,6,0,0,1


In [16]:
# Display the frame returned by get_room_dataset().
creation.get_room_dataset()

Unnamed: 0,bedroom number,kitchen number,living number,bathroom number,dining number,other number,other area
1,3,1,0,1,1,7,20.8
23,2,1,1,1,1,4,6.0
24,3,1,2,1,0,3,0.0
95,3,1,2,1,0,2,0.0
103,2,1,1,1,1,2,0.0
...,...,...,...,...,...,...,...
3264,2,1,0,1,1,5,16.0
3268,2,1,0,1,0,3,24.8
3269,3,1,0,1,0,5,62.7
3270,4,1,0,2,1,9,603.7


In [17]:
# Keep the frame returned by get_categorical_dataset() in `result` and display it.
result = creation.get_categorical_dataset()
result

Unnamed: 0,Allocated,Communal,Covered,Driveway,Garage,Gated,Off Street,On Street,Permit,Private,...,Central,Double Glazing,Eco-Friendly,Electric,Gas,Gas Central,Night Storage,Oil,Solar,Under Floor
1,0,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
23,0,0,0,1,0,0,1,0,0,1,...,1,1,0,0,0,1,0,0,0,0
24,0,0,0,1,1,0,1,0,0,0,...,1,1,0,0,0,1,0,0,0,0
95,0,0,0,1,0,0,1,0,0,1,...,1,1,0,0,0,1,0,0,0,0
103,0,0,0,1,0,0,1,0,0,1,...,1,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3264,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3268,0,0,0,0,0,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3269,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3270,0,0,0,1,1,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [18]:
# Display the frame returned by get_labels().
creation.get_labels()

Unnamed: 0,Completed,Price
1,1,200000.0
23,0,695.0
24,1,170000.0
95,0,675.0
103,0,650.0
...,...,...
3264,0,1100.0
3268,0,1100.0
3269,0,1050.0
3270,1,600000.0
