# Import packages

In [144]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup # handle html strings

# Read Dataset

In [15]:
# Path of the property-listings CSV export (a relative path).
filename = "../datasets/PropertyData_wDesc.csv"
# pandas' default UTF-8 decoding does not work on this file, so the
# ISO-8859-1 (Latin-1) codec is requested explicitly.
# NOTE(review): the preview shows an "Unnamed: 0" column, which usually means
# an index was written into the CSV; consider index_col=0 -- confirm first,
# since later cells may rely on the current column layout.
data = pd.read_csv(filepath_or_buffer=filename, encoding="ISO8859-1")
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Postcode,Created,Advertised,Agreed,Completed,Date Listing Last Cancelled,Sale or Let,RTD3308_outside_space1 - Outside Space Description,EweMove Description S1 Features,EweMove Description S2 Description,...,Price / Rent,Price Qualifier,Sale Price % Achieved,Current EPC - EPC Expiry Date,DESC Council Tax Band,DESC Leasehold Ground Rent,DESC Leasehold Service Charge,ZPG_lease_expiry_years_remaining,# of Enquiry or viewings,# of Apps/Offers
0,LU7 4WN,03/04/2019,03/04/2019,,,28/02/2022,Sale,Communal Garden,<ul><li>CASH BUYERS ONLY</li><li>No Upper Chai...,Cash Buyers... You could pounce super quick on...,...,"<font color='blue'>&pound;140,000</font><br>Of...",Offers In Excess Of,130.71%,11/03/2028,Band B,,,68.0,32,12
1,L23 6YD,21/03/2019,24/03/2019,05/04/2021,28/06/2021,26/02/2020,Sale,,<ul><li>Sea Views</li><li>Very Popular Locatio...,WATCHING THE SHIPS ROLL IN - from your very ow...,...,"<font color='blue'>&pound;325,000</font><br>Of...",Offers Over,,28/03/2029,Band E,,,,15,10
2,DA17 5PJ,25/03/2019,03/04/2019,25/01/2021,15/03/2021,13/06/2019,Sale,Rear Garden,<ul><li>Call NOW 24/7 or book instantly online...,"Guide price &pound;325,000 - &pound;350,000.Lo...",...,"<font color='blue'>&pound;325,000</font><br>Gu...",Guide Price,,12/07/2028,Band D,,,,14,4
3,NW4 1AP,28/06/2019,04/07/2019,,,,Sale,,<ul><li>5 bed link detached FREEHOLD house wit...,EweMove Hampstead - A rare opportunity to acqu...,...,"<font color='blue'>&pound;1,095,000</font><br>...",Guide Price,,30/06/2029,,,,,13,1
4,GL7 5UX,12/06/2019,09/08/2019,13/04/2021,05/07/2021,,Sale,Back Garden,<ul><li>Four Bedrooms</li><li>Detached</li><li...,For anyone looking for a family home in South ...,...,"<font color='blue'>&pound;395,000</font><br>",,100.00%,30/06/2024,,,,,11,9


# Inspecting the dataset

In [14]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(3649, 35)

In [54]:
# Count the missing values in every column (isna is the canonical name of isnull).
data.isna().sum()

Postcode                                                 0
Created                                                  0
Advertised                                             453
Agreed                                                1381
Completed                                             1483
Date Listing Last Cancelled                           2179
Sale or Let                                              0
RTD3308_outside_space1 - Outside Space Description    1450
EweMove Description S1 Features                          0
EweMove Description S2 Description                     301
EweMove Description S3 Rooms                           977
EweMove Description S4 Summary                           0
RTD3307_parking1 - Parking Description                1331
RTD3307_parking2 - Parking Description                2077
RTD3307_parking3 - Parking Description                2779
RTD3308_outside_space2 - Outside Space Description    2001
RTD3308_outside_space3 - Outside Space Description    25

# Handling HTML text

## EweMove Description S1 Features

In [104]:
# Use the first listing's "S1 Features" cell as a worked example and parse it
# with the built-in html.parser backend.
string = data.loc[0, "EweMove Description S1 Features"]
soup = BeautifulSoup(markup=string, features="html.parser")
# Show the raw cell content for reference.
string

'<ul><li>CASH BUYERS ONLY</li><li>No Upper Chain!</li><li>Very Rare 2 Car Parking Spaces</li><li>Great Commuter Links by Road & Rail</li><li>Communal Gardens</li><li>2 Bedrooms</li><li>Cul-De-Sac Location</li><li>Brand New Bathroom</li><li>Lovely Residential Area</li></ul>'

In [105]:
# Print the parsed markup with one tag per line to make its nesting visible.
print(soup.prettify())

<ul>
 <li>
  CASH BUYERS ONLY
 </li>
 <li>
  No Upper Chain!
 </li>
 <li>
  Very Rare 2 Car Parking Spaces
 </li>
 <li>
  Great Commuter Links by Road &amp; Rail
 </li>
 <li>
  Communal Gardens
 </li>
 <li>
  2 Bedrooms
 </li>
 <li>
  Cul-De-Sac Location
 </li>
 <li>
  Brand New Bathroom
 </li>
 <li>
  Lovely Residential Area
 </li>
</ul>


In [106]:
# Every <li> element of the parsed fragment is one feature bullet.
features = soup.select("li")
for feature in features:
    # get_text() is used instead of .string: .string is None whenever an <li>
    # has more than one child node (e.g. text plus a nested tag), which would
    # make .strip() raise AttributeError. For a plain-text <li> the printed
    # value is the same stripped text as before.
    print(feature.get_text(" ", strip=True))

CASH BUYERS ONLY
No Upper Chain!
Very Rare 2 Car Parking Spaces
Great Commuter Links by Road & Rail
Communal Gardens
2 Bedrooms
Cul-De-Sac Location
Brand New Bathroom
Lovely Residential Area


## EweMove Description S3 Rooms

In [107]:
# Use the first listing's "S3 Rooms" cell as a worked example.
# NOTE(review): this column contains missing values, so the same lookup on
# other rows may return NaN instead of an HTML string.
string = data.loc[0, "EweMove Description S3 Rooms"]
soup = BeautifulSoup(markup=string, features="html.parser")

In [108]:
# Every <li> describes one room: a <strong> heading and, optionally, an <i>
# element carrying the dimensions.
rooms = soup.select("li")
for room in rooms:
    # Keep the text after the last '-' of the heading and strip it: the
    # separator is followed by a space, which previously leaked into the name
    # and shifted the column alignment by one character.
    # NOTE(review): a room name that itself contains '-' would be truncated
    # by this split -- confirm the heading format before relying on it.
    heading = room.strong.get_text(" ", strip=True)
    name = heading.split('-')[-1].strip()
    # Rooms without an <i> element have no recorded dimensions; keep the
    # original placeholder 0 for them. An explicit None check replaces the
    # try/except so unrelated AttributeErrors are no longer swallowed.
    dimensions = room.i
    if dimensions is None:
        area = 0
    else:
        area = dimensions.get_text(" ", strip=True)
    print("{:25s}{}".format(name, area))

 Entrance Hall           0
 Living/Dining Room      6.58m x 3.78m (24.8 sqm) - 21' 7" x 12' 4" (267 sqft)
 Kitchen                 2.68m x 2.14m (5.7 sqm) - 8' 9" x 7' (61 sqft)
 Bedroom 1               3.37m x 2.45m (8.2 sqm) - 11' x 8' (88 sqft)
 Bedroom 2               2.54m x 2.45m (6.2 sqm) - 8' 4" x 8' (67 sqft)
 Bathroom                2.14m x 2.04m (4.3 sqm) - 7' x 6' 8" (46 sqft)
 Garden                  0
 Parking                 0


## EweMove Description S4 Summary

In [109]:
# Use the first listing's "S4 Summary" cell as a worked example.
string = data.loc[0, "EweMove Description S4 Summary"]
soup = BeautifulSoup(markup=string, features="html.parser")

In [113]:
# Every <li> pairs a bold label with a plain-text value starting with "Band".
summary = soup.select("li")
for item in summary:
    # The label lives in the <b> tag; drop surrounding ':' and space characters.
    label = item.b.string.strip(": ")
    # Among the direct children, keep the text nodes that start with "Band"
    # and take the first one (IndexError if there is none).
    band_nodes = [
        node
        for node in item.contents
        if isinstance(node, str) and node.startswith("Band")
    ]
    band = band_nodes[0]
    print("{:50s}{:15s}".format(label, band))

Council Tax                                       Band B         
Energy Performance Certificate (EPC) Rating       Band C (69-80) 


## Price / Rent

In [142]:
# Use the first listing's "Price / Rent" cell as a worked example.
string = data.loc[0, "Price / Rent"]
# Show the raw cell content for reference.
string

"<font color='blue'>&pound;140,000</font><br>Offers In Excess Of"

In [160]:
# The text after the last <br> is the qualifier wording that accompanies the price.
price_qualifier = string.split("<br>")[-1]

# Number matcher: optional sign, digits with optional ",ddd" thousands groups,
# optional decimal part and optional exponent.
# Written as a raw string: in a normal string literal "\d" and "\." are
# invalid escape sequences that trigger a warning on current Python versions.
# The pattern text itself is unchanged.
pattern = r"[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?"
# The first number found in the markup is taken as the price
# (IndexError if the cell contains no number at all).
price = re.findall(pattern, string)[0]

print(price_qualifier)
print(price)

Offers In Excess Of
140,000
