# Import packages

In [217]:
import pandas as pd
import numpy as np
import parse
import re
from bs4 import BeautifulSoup # handle html strings
from ProcessHTML import ProcessHTML

# Read Dataset

In [206]:
# Relative path of the raw property listings CSV.
filename = "../datasets/PropertyData_wDesc.csv"
# Default encoding (UTF-8) not working on this dataset, so the file is decoded as ISO-8859-1 instead.
data = pd.read_csv(filename, encoding="ISO8859-1")
# Preview the first rows to confirm the file was parsed.
data.head()

Unnamed: 0,Postcode,Created,Advertised,Agreed,Completed,Date Listing Last Cancelled,Sale or Let,RTD3308_outside_space1 - Outside Space Description,EweMove Description S1 Features,EweMove Description S2 Description,...,Price / Rent,Price Qualifier,Sale Price % Achieved,Current EPC - EPC Expiry Date,DESC Council Tax Band,DESC Leasehold Ground Rent,DESC Leasehold Service Charge,ZPG_lease_expiry_years_remaining,# of Enquiry or viewings,# of Apps/Offers
0,LU7 4WN,03/04/2019,03/04/2019,,,28/02/2022,Sale,Communal Garden,<ul><li>CASH BUYERS ONLY</li><li>No Upper Chai...,Cash Buyers... You could pounce super quick on...,...,"<font color='blue'>&pound;140,000</font><br>Of...",Offers In Excess Of,130.71%,11/03/2028,Band B,,,68.0,32,12
1,L23 6YD,21/03/2019,24/03/2019,05/04/2021,28/06/2021,26/02/2020,Sale,,<ul><li>Sea Views</li><li>Very Popular Locatio...,WATCHING THE SHIPS ROLL IN - from your very ow...,...,"<font color='blue'>&pound;325,000</font><br>Of...",Offers Over,,28/03/2029,Band E,,,,15,10
2,DA17 5PJ,25/03/2019,03/04/2019,25/01/2021,15/03/2021,13/06/2019,Sale,Rear Garden,<ul><li>Call NOW 24/7 or book instantly online...,"Guide price &pound;325,000 - &pound;350,000.Lo...",...,"<font color='blue'>&pound;325,000</font><br>Gu...",Guide Price,,12/07/2028,Band D,,,,14,4
3,NW4 1AP,28/06/2019,04/07/2019,,,,Sale,,<ul><li>5 bed link detached FREEHOLD house wit...,EweMove Hampstead - A rare opportunity to acqu...,...,"<font color='blue'>&pound;1,095,000</font><br>...",Guide Price,,30/06/2029,,,,,13,1
4,GL7 5UX,12/06/2019,09/08/2019,13/04/2021,05/07/2021,,Sale,Back Garden,<ul><li>Four Bedrooms</li><li>Detached</li><li...,For anyone looking for a family home in South ...,...,"<font color='blue'>&pound;395,000</font><br>",,100.00%,30/06/2024,,,,,11,9


# Inspecting the dataset

In [157]:
data.shape

(3649, 35)

In [158]:
data.isnull().sum()

Postcode                                                 0
Created                                                  0
Advertised                                             453
Agreed                                                1381
Completed                                             1483
Date Listing Last Cancelled                           2179
Sale or Let                                              0
RTD3308_outside_space1 - Outside Space Description    1450
EweMove Description S1 Features                          0
EweMove Description S2 Description                     301
EweMove Description S3 Rooms                           977
EweMove Description S4 Summary                           0
RTD3307_parking1 - Parking Description                1331
RTD3307_parking2 - Parking Description                2077
RTD3307_parking3 - Parking Description                2779
RTD3308_outside_space2 - Outside Space Description    2001
RTD3308_outside_space3 - Outside Space Description    25

# Handling HTML text

## EweMove Description S1 Features

In [159]:
string = data["EweMove Description S1 Features"][0]
soup = BeautifulSoup(string, "html.parser")
string

'<ul><li>CASH BUYERS ONLY</li><li>No Upper Chain!</li><li>Very Rare 2 Car Parking Spaces</li><li>Great Commuter Links by Road & Rail</li><li>Communal Gardens</li><li>2 Bedrooms</li><li>Cul-De-Sac Location</li><li>Brand New Bathroom</li><li>Lovely Residential Area</li></ul>'

In [160]:
print(soup.prettify())

<ul>
 <li>
  CASH BUYERS ONLY
 </li>
 <li>
  No Upper Chain!
 </li>
 <li>
  Very Rare 2 Car Parking Spaces
 </li>
 <li>
  Great Commuter Links by Road &amp; Rail
 </li>
 <li>
  Communal Gardens
 </li>
 <li>
  2 Bedrooms
 </li>
 <li>
  Cul-De-Sac Location
 </li>
 <li>
  Brand New Bathroom
 </li>
 <li>
  Lovely Residential Area
 </li>
</ul>


In [161]:
# Every <li> of the features list is printed on its own line.
features = soup.select("li")
for feature in features:
    # `.string` is None as soon as an <li> wraps nested markup (or is empty), which made
    # `.strip()` raise AttributeError. get_text() gathers the text of all descendants and,
    # for a plain-text <li>, yields exactly the same stripped string.
    print(feature.get_text(" ", strip=True))

CASH BUYERS ONLY
No Upper Chain!
Very Rare 2 Car Parking Spaces
Great Commuter Links by Road & Rail
Communal Gardens
2 Bedrooms
Cul-De-Sac Location
Brand New Bathroom
Lovely Residential Area


## EweMove Description S3 Rooms

In [162]:
string = data["EweMove Description S3 Rooms"][6]
soup = BeautifulSoup(string, "html.parser")

In [163]:
# Print "<room name><dimensions>" for every <li> of the rooms description.
rooms = soup.select("li")
for room in rooms:
    # An <li> without a <strong> title used to crash with an uncaught AttributeError
    # (only the <i> lookup was protected); such entries carry no room name, so skip them.
    if room.strong is None:
        continue

    # Keep only the text after the last '-' of the title. get_text() also copes with
    # titles that contain nested markup, where `.string` would be None.
    name = room.strong.get_text().split('-')[-1].strip()

    # The dimensions are read from the <i> tag; when the tag is absent the notebook's
    # original placeholder value 1 is kept.
    area = room.i.string if room.i is not None else 1
    print("{:25s}{}".format(name, area))

Living Room              4.34m x 4.11m (17.8 sqm) - 14' 3" x 13' 5" (192 sqft)
Dining Room              3.13m x 2.83m (8.8 sqm) - 10' 3" x 9' 3" (95 sqft)
Kitchen                  3.12m x 2.83m (8.8 sqm) - 10' 2" x 9' 3" (95 sqft)
Play Room                2.29m x 2.04m (4.6 sqm) - 7' 6" x 6' 8" (50 sqft)
Bedroom 1                4.05m x 3.38m (13.7 sqm) - 13' 3" x 11' 1" (147 sqft)
Bedroom 2                4.11m x 2.29m (9.4 sqm) - 13' 5" x 7' 6" (101 sqft)
Bedroom 3                3.36m x 2.86m (9.6 sqm) - 11' x 9' 4" (103 sqft)
Garage                   5.09m x 2.48m (12.6 sqm) - 16' 8" x 8' 1" (136 sqft)


## EweMove Description S4 Summary

In [164]:
string = data["EweMove Description S4 Summary"][26]
soup = BeautifulSoup(string, "html.parser")

In [165]:
# Cut the raw HTML at every <li> and keep only the pieces that open with a bold heading.
summary = [fragment for fragment in string.split("<li>") if fragment.startswith("<b>")]
summary

['<b>HUGE Summerhouse</b><br>',
 '<b>Extra long garage</b><br>',
 '<b>Parking for 4 cars</b><br>',
 '<b>Great internal space</b><br>',
 "<b>South-facing garden</b></li><br><br>Call 24/7 to register your interest or go online to book a viewing at a time that suits you - it's that easy."]

In [166]:
# Templates for the shapes a summary fragment can take. Each `{}` captures one piece of
# text: the bold heading first, then any free text that follows it.
single = parse.compile("<b>{}</b><br>")
double = parse.compile("<b>{}</b><br><br>{}<br><br>")
final1 = parse.compile("<b>{}</b><br><br>{}<br></li>")
final2 = parse.compile("<b>{}</b><br><br>{}<br></li><br><br>{}")
final3 = parse.compile("<b>{}</b></li>")
final4 = parse.compile("<b>{}</b></li><br><br>{}")
# Order matters: the first template that matches a fragment wins.
parsers = [single, double, final1, final2, final3, final4]
for s in summary:
    # Stop at the first matching template. Indexing `[0]` into the list of matches raised
    # IndexError whenever a fragment fitted none of the templates.
    result = next((r for r in (p.parse(s) for p in parsers) if r is not None), None)
    if result is None:
        # Surface the fragment instead of aborting the whole loop.
        print("Unparsed summary fragment: {!r}".format(s))
        continue
    print(result.fixed)

('HUGE Summerhouse',)
('Extra long garage',)
('Parking for 4 cars',)
('Great internal space',)
('South-facing garden', "Call 24/7 to register your interest or go online to book a viewing at a time that suits you - it's that easy.")


## Price / Rent

In [167]:
string = data["Price / Rent"][3632]
string

"<font color='blue'>&pound;0</font><br>"

In [168]:
# Whatever follows the last <br> is treated as the price qualifier (may be an empty string).
price_qualifier = string.split("<br>")[-1]

# Raw string literal: "\d" and "\." are regex escapes, not Python string escapes. In a normal
# string literal they are invalid escape sequences and trigger a warning on recent Python
# versions. The regular expression itself is unchanged.
pattern = r"[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?"
# The first number found in the HTML snippet is taken as the price.
price = re.findall(pattern, string)[0]

print(price_qualifier)
print(price)


0


# Preprocess records

## Parking spaces

In [169]:
# Pick every column whose header mentions "parking" and work on an independent copy of them.
parkings_name = [column for column in data.columns if "parking" in column]
parkings = data[parkings_name].copy()
parkings.head()

Unnamed: 0,RTD3307_parking1 - Parking Description,RTD3307_parking2 - Parking Description,RTD3307_parking3 - Parking Description
0,Allocated,Off Street,Residents
1,,,
2,On Street,,
3,,,
4,Driveway,Garage,Off Street


### Types of parking spaces for every property
First obtain all the parking types that occur in the original dataset, then record which of those types each property has.

In [170]:
parkings = parkings.fillna("None")
types, counts = np.unique(parkings.to_numpy().flatten(), return_counts=True)
list(zip(types, counts))

[('Allocated', 343),
 ('Communal', 81),
 ('Covered', 36),
 ('Driveway', 1257),
 ('Garage', 842),
 ('Gated', 89),
 ('None', 6187),
 ('Off Street', 1209),
 ('On Street', 458),
 ('Permit', 70),
 ('Private', 258),
 ('Rear', 27),
 ('Residents', 90)]

In [171]:
def get_parking_spaces_for_property(first: str, second: str, third: str):
    """Count how often each known parking type appears among a property's three slots.

    Args:
        first: Value of the first parking description.
        second: Value of the second parking description.
        third: Value of the third parking description.

    Returns:
        A new dict mapping every known parking type to the number of slots (0-3)
        holding that type. Values that are not a known type are ignored.
    """
    known_types = ("Allocated", "Off Street", "Residents", "On Street", "Driveway", "Garage",
                   "Permit", "Private", "Gated", "Covered", "Communal", "Rear")
    tally = dict.fromkeys(known_types, 0)

    for slot in (first, second, third):
        if slot in tally:
            tally[slot] += 1

    return tally

In [172]:
# One list per parking type, collecting that type's per-property count in row order.
parking_types = {key: [] for key in types if key != "None"}
# Only the first three columns are read, one per parking description slot.
for first, second, third in parkings.iloc[:, :3].itertuples(index=False, name=None):
    # Dedicated name on purpose: the loop used to rebind `types`, silently replacing the
    # array of parking types that the comprehension above iterates over.
    per_property = get_parking_spaces_for_property(first, second, third)

    # A plain loop replaces list(map(lambda ...)), which was executed only for its side effects.
    for parking_type, count in per_property.items():
        parking_types[parking_type].append(count)

In [173]:
for k, v in parking_types.items():
    print("{:10s}\t{:5d}\t{:5d}".format(k, sum(v), len(v)))

Allocated 	  343	 3649
Communal  	   81	 3649
Covered   	   36	 3649
Driveway  	 1257	 3649
Garage    	  842	 3649
Gated     	   89	 3649
Off Street	 1209	 3649
On Street 	  458	 3649
Permit    	   70	 3649
Private   	  258	 3649
Rear      	   27	 3649
Residents 	   90	 3649


### Number of parking spaces for every property

In [174]:
# Go back to the raw columns so that missing values are NaN again.
parkings = data[parkings_name]
# count(axis=1) returns the number of non-missing cells per row in one vectorised call,
# replacing a Python loop that built a Series for every single row via .iloc.
parking_num = parkings.count(axis=1).tolist()
parking_num = pd.DataFrame({"parking num": parking_num})
parking_num.head()

Unnamed: 0,parking num
0,3
1,0
2,1
3,0
4,3


In [175]:
parking_num.sum()

parking num    4760
dtype: int64

## Outside spaces

In [176]:
# Pick every column whose header mentions "outside_space" and work on an independent copy.
outside_name = [column for column in data.columns if "outside_space" in column]
outside = data[outside_name].copy()
outside.head()

Unnamed: 0,RTD3308_outside_space1 - Outside Space Description,RTD3308_outside_space2 - Outside Space Description,RTD3308_outside_space3 - Outside Space Description
0,Communal Garden,,
1,,,
2,Rear Garden,Private Garden,
3,,,
4,Back Garden,Enclosed Garden,Private Garden


### Types of outside spaces for every property

In [177]:
def record_types(first: str, second: str, third: str, types: dict):
    """Increment the counters in ``types`` for each of the three given values.

    Args:
        first: First value to record.
        second: Second value to record.
        third: Third value to record.
        types: Counter dict keyed by the recognised type names. It is updated
            in place; values that are not already a key are ignored.

    Returns:
        The same ``types`` dict that was passed in, after the update.
    """
    for entry in (first, second, third):
        if entry in types:
            types[entry] += 1

    return types

In [178]:
outside = outside.fillna("None")
types, counts = np.unique(outside.to_numpy().flatten(), return_counts=True)
list(zip(types, counts))

[('Back Garden', 1366),
 ('Communal Garden', 227),
 ('Enclosed Garden', 735),
 ('Front Garden', 682),
 ('None', 6012),
 ('Patio', 431),
 ('Private Garden', 599),
 ('Rear Garden', 802),
 ('Terrace', 93)]

In [179]:
# One list per outside-space type, collecting that type's per-property count in row order.
outside_types = {key: [] for key in types if key != "None"}
# Only the first three columns are read, one per outside-space description slot.
for first, second, third in outside.iloc[:, :3].itertuples(index=False, name=None):
    # A fresh zeroed counter per property. It gets its own name because the loop used to
    # rebind `types`, replacing the array of type names with a dict on the first iteration.
    per_property = record_types(first, second, third, {key: 0 for key in outside_types})

    # A plain loop replaces list(map(lambda ...)), which was executed only for its side effects.
    for space_type, count in per_property.items():
        outside_types[space_type].append(count)

In [180]:
for k, v in outside_types.items():
    print("{:15s}{:5d}".format(k, sum(v)))

Back Garden     1366
Communal Garden  227
Enclosed Garden  735
Front Garden     682
Patio            431
Private Garden   599
Rear Garden      802
Terrace           93


### Number of outside spaces for every property

In [181]:
# Go back to the raw columns so that missing values are NaN again.
outside = data[outside_name]
# count(axis=1) returns the number of non-missing cells per row in one vectorised call,
# replacing a Python loop that built a Series for every single row via .iloc.
outside_num = outside.count(axis=1).tolist()
temp = pd.DataFrame({"outside_num": outside_num})
temp.head()

Unnamed: 0,outside_num
0,1
1,0
2,2
3,0
4,3


In [182]:
temp.sum()

outside_num    4935
dtype: int64

## Condition

In [183]:
# Pick every column whose header mentions "condition" and work on an independent copy.
condition_names = [column for column in data.columns if "condition" in column]
condition = data[condition_names].copy()
condition.head()

Unnamed: 0,RTD3316_condition1 - Condition Description
0,Good
1,
2,Good
3,
4,Good


In [184]:
condition = condition.fillna("None")
condition.head()

Unnamed: 0,RTD3316_condition1 - Condition Description
0,Good
1,
2,Good
3,
4,Good


## Heating

In [185]:
# Pick every column whose header mentions "heating" and work on an independent copy.
heating_names = [column for column in data.columns if "heating" in column]
heating = data[heating_names].copy()
heating.head()

Unnamed: 0,RTD3318_heating1 - Heating Description,RTD3318_heating2 - Heating Description,RTD3318_heating3 - Heating Description
0,Double Glazing,Electric,Night Storage
1,,,
2,Central,Gas Central,Double Glazing
3,,,
4,Central,Gas,Double Glazing


In [186]:
heating = heating.fillna("None")
types, count = np.unique(heating.to_numpy().flatten(), return_counts=True)
list(zip(types, count))

[('Air Conditioning', 6),
 ('Central', 476),
 ('Double Glazing', 1550),
 ('Eco-Friendly', 17),
 ('Electric', 183),
 ('Gas', 493),
 ('Gas Central', 1814),
 ('Night Storage', 24),
 ('None', 6270),
 ('Oil', 63),
 ('Solar', 15),
 ('Solar Water', 2),
 ('Under Floor', 34)]

### Types of heating for every property

In [187]:
# One list per heating type, collecting that type's per-property count in row order.
heating_types = {key: [] for key in types if key != "None"}
# Only the first three columns are read, one per heating description slot.
for first, second, third in heating.iloc[:, :3].itertuples(index=False, name=None):
    # A fresh zeroed counter per property. It gets its own name because the loop used to
    # rebind `types`, replacing the array of type names with a dict on the first iteration.
    per_property = record_types(first, second, third, {key: 0 for key in heating_types})

    # A plain loop replaces list(map(lambda ...)), which was executed only for its side effects.
    for heating_type, count in per_property.items():
        heating_types[heating_type].append(count)

In [188]:
for k, v in heating_types.items():
    print("{:20s}{:5d}".format(k, sum(v)))

Air Conditioning        6
Central               476
Double Glazing       1550
Eco-Friendly           17
Electric              183
Gas                   493
Gas Central          1814
Night Storage          24
Oil                    63
Solar                  15
Solar Water             2
Under Floor            34


### Number of heating types for every property

In [189]:
# Go back to the raw columns so that missing values are NaN again.
heating = data[heating_names]
# count(axis=1) returns the number of non-missing cells per row in one vectorised call,
# replacing a Python loop that built a Series for every single row via .iloc.
heating_num = heating.count(axis=1).tolist()
temp = pd.DataFrame({"heating_num": heating_num})
temp.head()

Unnamed: 0,heating_num
0,3
1,0
2,3
3,0
4,3


In [190]:
temp.sum()

heating_num    4677
dtype: int64

## Accessibility

In [209]:
# Pick every column whose header mentions "accessibility" and work on an independent copy.
accessibility_name = [column for column in data.columns if "accessibility" in column]
accessibility = data[accessibility_name].copy()
accessibility.head()

Unnamed: 0,RTD3317_accessibility1 - accessibility Description,RTD3317_accessibility2 - accessibility Description,RTD3317_accessibility3 - accessibility Description
0,Not suitable for wheelchair users,,
1,,,
2,,,
3,,,
4,Level access,,


In [212]:
accessibility = accessibility.fillna("None")
types, count = np.unique(accessibility.to_numpy().flatten(), return_counts=True)

### Types of accessibility

In [213]:
# One list per accessibility type, collecting that type's per-property count in row order.
accessibility_type = {key: [] for key in types if key != "None"}
# Only the first three columns are read, one per accessibility description slot.
for first, second, third in accessibility.iloc[:, :3].itertuples(index=False, name=None):
    # A fresh zeroed counter per property. It gets its own name because the loop used to
    # rebind `types`, replacing the array of type names with a dict on the first iteration.
    per_property = record_types(first, second, third, {key: 0 for key in accessibility_type})

    # A plain loop replaces list(map(lambda ...)), which was executed only for its side effects.
    for access_type, count in per_property.items():
        accessibility_type[access_type].append(count)

In [214]:
for k, v in accessibility_type.items():
    print("{:35s}{:5d}".format(k, sum(v)))

Level access                         522
Lift access                           64
Not suitable for wheelchair users    779
Ramped access                         24
Wet room                              19
Wide doorways                         47


### Number of accessibility features for every property

In [215]:
# Go back to the raw columns so that missing values are NaN again.
accessibility = data[accessibility_name]
# count(axis=1) returns the number of non-missing cells per row in one vectorised call,
# replacing a Python loop that built a Series for every single row via .iloc.
accessibility_num = accessibility.count(axis=1).tolist()
temp = pd.DataFrame({"accessibility_num": accessibility_num})
temp

Unnamed: 0,accessibility_num
0,1
1,0
2,0
3,0
4,1
...,...
3644,0
3645,0
3646,0
3647,0


In [216]:
temp.sum()

accessibility_num    1455
dtype: int64

## Rooms

In [257]:
extract_area = parse.compile("{} ({} sqm){}")

### Bedrooms

In [218]:
# Run every listing's raw rooms description through the project's HTML handler.
# NOTE(review): the handler presumably accumulates its results in `s3_rooms` and
# `s3_rooms_set`, which later cells read - confirm against ProcessHTML.
info = data["EweMove Description S3 Rooms"]
handler = ProcessHTML()
for rooms_html in info:
    handler.EweMove_Description_S3_Rooms(rooms_html)

In [222]:
# Keep the room names collected by the handler that mention "Bedroom".
bedroom_names = list(filter(lambda room_name: "Bedroom" in room_name, handler.s3_rooms_set))
bedroom_names

['Bedroom (Double) with Ensuite',
 'Bedroom 7',
 'Bedroom 8',
 'Master Bedroom with Ensuite',
 'Bedroom 6',
 'Bedroom 5',
 'Bedroom 4',
 'Bedroom/Living Room',
 'Bedroom 3',
 'Master Bedroom',
 'Loft Bedroom',
 'Bedroom (Single)',
 'Bedroom (Double)',
 'Bedroom',
 'Bedroom (Single) with Ensuite',
 'Bedroom 2',
 'Attic Bedroom',
 'Bedroom 1']

In [226]:
# Print name and dimensions of every bedroom, listing by listing.
for listing_rooms in handler.s3_rooms:
    # Listings without parsed rooms are stored as None and have nothing to print.
    if listing_rooms is None:
        continue
    for room_name, dimensions in listing_rooms.items():
        if room_name not in bedroom_names:
            continue
        print("{:30s}{}".format(room_name, dimensions))

Bedroom 1                     3.37m x 2.45m (8.2 sqm) - 11' x 8' (88 sqft)
Bedroom 2                     2.54m x 2.45m (6.2 sqm) - 8' 4" x 8' (67 sqft)
Bedroom 1                     4.07m x 2.43m (9.8 sqm) - 13' 4" x 7' 11" (106 sqft)
Bedroom 2                     3.48m x 2.43m (8.4 sqm) - 11' 5" x 7' 11" (91 sqft)
Bedroom 3                     5.74m x 2.31m (13.2 sqm) - 18' 9" x 7' 6" (142 sqft)
Bedroom 4                     5.34m x 3.13m (16.7 sqm) - 17' 6" x 10' 3" (179 sqft)
Bedroom (Double)              4.3m x 2.44m (10.4 sqm) - 14' 1" x 8' (112 sqft)
Master Bedroom                4.83m x 2.29m (11 sqm) - 15' 10" x 7' 6" (119 sqft)
Bedroom 2                     3.24m x 3.01m (9.7 sqm) - 10' 7" x 9' 10" (104 sqft)
Bedroom 3                     2.86m x 2.62m (7.4 sqm) - 9' 4" x 8' 7" (80 sqft)
Bedroom 4                     2.86m x 2.62m (7.4 sqm) - 9' 4" x 8' 7" (80 sqft)
Bedroom 1                     5m x 4.09m (20.5 sqm) - 16' 4" x 13' 5" (220 sqft)
Bedroom 2                     5

#### Obtain the maximum number of bedrooms in a property

In [247]:
# Number of bedroom entries per listing; listings without parsed rooms (None) count as 0.
bedrooms_num = [
    0 if listing_rooms is None else sum(1 for key in listing_rooms.keys() if "Bedroom" in key)
    for listing_rooms in handler.s3_rooms
]
# The largest count fixes how many "Bedroom N" columns are needed later on.
max_bedroom_num = max(bedrooms_num)

### Bedroom area

In [260]:
# Build one dict per listing that maps "Bedroom 1" .. "Bedroom <max>" to the room's area
# in square metres. Slots that are unused, or whose area is unavailable, stay 0.
bedroom_names = ["Bedroom {}".format(i + 1) for i in range(max_bedroom_num)]
bedrooms = []
for listing_rooms in handler.s3_rooms:
    bedroom_per_property = {key: 0 for key in bedroom_names}

    # Listings without parsed rooms are stored as None and keep the all-zero row.
    if listing_rooms is not None:
        # Bedrooms are numbered by order of appearance, whatever they were originally called
        # ("Master Bedroom", "Bedroom (Double)", ...).
        bedroom_values = [v for k, v in listing_rooms.items() if "Bedroom" in k]
        for num, v in enumerate(bedroom_values, start=1):
            # Only string descriptions can carry an "(x sqm)" part; anything else keeps 0.
            parsed = extract_area.parse(v) if isinstance(v, str) else None
            # parse() returns None when the description does not fit the template. The
            # result used to be indexed unconditionally, which raised TypeError; such
            # rooms are now left at 0.
            if parsed is not None:
                bedroom_per_property["Bedroom {}".format(num)] = float(parsed[1])

    bedrooms.append(bedroom_per_property)

In [261]:
bedrooms

[{'Bedroom 1': 8.2,
  'Bedroom 2': 6.2,
  'Bedroom 3': 0,
  'Bedroom 4': 0,
  'Bedroom 5': 0,
  'Bedroom 6': 0,
  'Bedroom 7': 0,
  'Bedroom 8': 0},
 {'Bedroom 1': 9.8,
  'Bedroom 2': 8.4,
  'Bedroom 3': 13.2,
  'Bedroom 4': 16.7,
  'Bedroom 5': 0,
  'Bedroom 6': 0,
  'Bedroom 7': 0,
  'Bedroom 8': 0},
 {'Bedroom 1': 10.4,
  'Bedroom 2': 0,
  'Bedroom 3': 0,
  'Bedroom 4': 0,
  'Bedroom 5': 0,
  'Bedroom 6': 0,
  'Bedroom 7': 0,
  'Bedroom 8': 0},
 {'Bedroom 1': 0,
  'Bedroom 2': 0,
  'Bedroom 3': 0,
  'Bedroom 4': 0,
  'Bedroom 5': 0,
  'Bedroom 6': 0,
  'Bedroom 7': 0,
  'Bedroom 8': 0},
 {'Bedroom 1': 11.0,
  'Bedroom 2': 9.7,
  'Bedroom 3': 7.4,
  'Bedroom 4': 7.4,
  'Bedroom 5': 0,
  'Bedroom 6': 0,
  'Bedroom 7': 0,
  'Bedroom 8': 0},
 {'Bedroom 1': 20.5,
  'Bedroom 2': 24.0,
  'Bedroom 3': 13.6,
  'Bedroom 4': 10.8,
  'Bedroom 5': 0,
  'Bedroom 6': 0,
  'Bedroom 7': 0,
  'Bedroom 8': 0},
 {'Bedroom 1': 13.7,
  'Bedroom 2': 9.4,
  'Bedroom 3': 9.6,
  'Bedroom 4': 0,
  'Bedroom 5'