# Import packages

In [1]:
import pandas as pd
import numpy as np
import parse
import re
from bs4 import BeautifulSoup # handle html strings
from ProcessHTML import ProcessHTML

# Read Dataset

In [2]:
filename = "../datasets/PropertyData_wDesc.csv"
# Default encoding (UTF-8) not working on this dataset
data = pd.read_csv(filename, encoding="ISO8859-1")
data.head()

Unnamed: 0,Postcode,Created,Advertised,Agreed,Completed,Date Listing Last Cancelled,Sale or Let,RTD3308_outside_space1 - Outside Space Description,EweMove Description S1 Features,EweMove Description S2 Description,...,Price / Rent,Price Qualifier,Sale Price % Achieved,Current EPC - EPC Expiry Date,DESC Council Tax Band,DESC Leasehold Ground Rent,DESC Leasehold Service Charge,ZPG_lease_expiry_years_remaining,# of Enquiry or viewings,# of Apps/Offers
0,LU7 4WN,03/04/2019,03/04/2019,,,28/02/2022,Sale,Communal Garden,<ul><li>CASH BUYERS ONLY</li><li>No Upper Chai...,Cash Buyers... You could pounce super quick on...,...,"<font color='blue'>&pound;140,000</font><br>Of...",Offers In Excess Of,130.71%,11/03/2028,Band B,,,68.0,32,12
1,L23 6YD,21/03/2019,24/03/2019,05/04/2021,28/06/2021,26/02/2020,Sale,,<ul><li>Sea Views</li><li>Very Popular Locatio...,WATCHING THE SHIPS ROLL IN - from your very ow...,...,"<font color='blue'>&pound;325,000</font><br>Of...",Offers Over,,28/03/2029,Band E,,,,15,10
2,DA17 5PJ,25/03/2019,03/04/2019,25/01/2021,15/03/2021,13/06/2019,Sale,Rear Garden,<ul><li>Call NOW 24/7 or book instantly online...,"Guide price &pound;325,000 - &pound;350,000.Lo...",...,"<font color='blue'>&pound;325,000</font><br>Gu...",Guide Price,,12/07/2028,Band D,,,,14,4
3,NW4 1AP,28/06/2019,04/07/2019,,,,Sale,,<ul><li>5 bed link detached FREEHOLD house wit...,EweMove Hampstead - A rare opportunity to acqu...,...,"<font color='blue'>&pound;1,095,000</font><br>...",Guide Price,,30/06/2029,,,,,13,1
4,GL7 5UX,12/06/2019,09/08/2019,13/04/2021,05/07/2021,,Sale,Back Garden,<ul><li>Four Bedrooms</li><li>Detached</li><li...,For anyone looking for a family home in South ...,...,"<font color='blue'>&pound;395,000</font><br>",,100.00%,30/06/2024,,,,,11,9


# Inspecting the dataset

In [3]:
data.shape

(3649, 35)

In [4]:
data.isnull().sum()

Postcode                                                 0
Created                                                  0
Advertised                                             453
Agreed                                                1381
Completed                                             1483
Date Listing Last Cancelled                           2179
Sale or Let                                              0
RTD3308_outside_space1 - Outside Space Description    1450
EweMove Description S1 Features                          0
EweMove Description S2 Description                     301
EweMove Description S3 Rooms                           977
EweMove Description S4 Summary                           0
RTD3307_parking1 - Parking Description                1331
RTD3307_parking2 - Parking Description                2077
RTD3307_parking3 - Parking Description                2779
RTD3308_outside_space2 - Outside Space Description    2001
RTD3308_outside_space3 - Outside Space Description    25

# Handling HTML text

## EweMove Description S1 Features

In [5]:
string = data["EweMove Description S1 Features"][0]
soup = BeautifulSoup(string, "html.parser")
string

'<ul><li>CASH BUYERS ONLY</li><li>No Upper Chain!</li><li>Very Rare 2 Car Parking Spaces</li><li>Great Commuter Links by Road & Rail</li><li>Communal Gardens</li><li>2 Bedrooms</li><li>Cul-De-Sac Location</li><li>Brand New Bathroom</li><li>Lovely Residential Area</li></ul>'

In [6]:
print(soup.prettify())

<ul>
 <li>
  CASH BUYERS ONLY
 </li>
 <li>
  No Upper Chain!
 </li>
 <li>
  Very Rare 2 Car Parking Spaces
 </li>
 <li>
  Great Commuter Links by Road &amp; Rail
 </li>
 <li>
  Communal Gardens
 </li>
 <li>
  2 Bedrooms
 </li>
 <li>
  Cul-De-Sac Location
 </li>
 <li>
  Brand New Bathroom
 </li>
 <li>
  Lovely Residential Area
 </li>
</ul>


In [7]:
features = soup.select("li")
for feature in features:
    print(feature.string.strip())

CASH BUYERS ONLY
No Upper Chain!
Very Rare 2 Car Parking Spaces
Great Commuter Links by Road & Rail
Communal Gardens
2 Bedrooms
Cul-De-Sac Location
Brand New Bathroom
Lovely Residential Area


## EweMove Description S3 Rooms

In [8]:
string = data["EweMove Description S3 Rooms"][6]
soup = BeautifulSoup(string, "html.parser")

In [9]:
rooms = soup.select("li")
for room in rooms:
    name = room.strong.string.split('-')[-1].strip()
    try:
        area = room.i.string
    except AttributeError:
        area = 1
    print("{:25s}{}".format(name, area))

Living Room              4.34m x 4.11m (17.8 sqm) - 14' 3" x 13' 5" (192 sqft)
Dining Room              3.13m x 2.83m (8.8 sqm) - 10' 3" x 9' 3" (95 sqft)
Kitchen                  3.12m x 2.83m (8.8 sqm) - 10' 2" x 9' 3" (95 sqft)
Play Room                2.29m x 2.04m (4.6 sqm) - 7' 6" x 6' 8" (50 sqft)
Bedroom 1                4.05m x 3.38m (13.7 sqm) - 13' 3" x 11' 1" (147 sqft)
Bedroom 2                4.11m x 2.29m (9.4 sqm) - 13' 5" x 7' 6" (101 sqft)
Bedroom 3                3.36m x 2.86m (9.6 sqm) - 11' x 9' 4" (103 sqft)
Garage                   5.09m x 2.48m (12.6 sqm) - 16' 8" x 8' 1" (136 sqft)


## EweMove Description S4 Summary

In [10]:
string = data["EweMove Description S4 Summary"][26]
soup = BeautifulSoup(string, "html.parser")

In [11]:
summary = list(filter(lambda s: s.startswith("<b>"), string.split("<li>")))
summary

['<b>HUGE Summerhouse</b><br>',
 '<b>Extra long garage</b><br>',
 '<b>Parking for 4 cars</b><br>',
 '<b>Great internal space</b><br>',
 "<b>South-facing garden</b></li><br><br>Call 24/7 to register your interest or go online to book a viewing at a time that suits you - it's that easy."]

In [12]:
single = parse.compile("<b>{}</b><br>")
double = parse.compile("<b>{}</b><br><br>{}<br><br>")
final1 = parse.compile("<b>{}</b><br><br>{}<br></li>")
final2 = parse.compile("<b>{}</b><br><br>{}<br></li><br><br>{}")
final3 = parse.compile("<b>{}</b></li>")
final4 = parse.compile("<b>{}</b></li><br><br>{}")
parsers = [single, double, final1, final2, final3, final4]
for s in summary:
    result = list(filter(lambda r: r is not None, map(lambda p: p.parse(s), parsers)))[0]
    print(result.fixed)

('HUGE Summerhouse',)
('Extra long garage',)
('Parking for 4 cars',)
('Great internal space',)
('South-facing garden', "Call 24/7 to register your interest or go online to book a viewing at a time that suits you - it's that easy.")


## Price / Rent

In [13]:
string = data["Price / Rent"][3632]
string

"<font color='blue'>&pound;0</font><br>"

In [14]:
price_qualifier = string.split("<br>")[-1]

pattern = "[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?"
price = re.findall(pattern, string)[0]

print(price_qualifier)
print(price)


0


# Preprocess records

## Parking spaces

In [15]:
parkings_name = list(filter(lambda s: "parking" in s, data.columns))
parkings = data[parkings_name].copy()
parkings.head()

Unnamed: 0,RTD3307_parking1 - Parking Description,RTD3307_parking2 - Parking Description,RTD3307_parking3 - Parking Description
0,Allocated,Off Street,Residents
1,,,
2,On Street,,
3,,,
4,Driveway,Garage,Off Street


### Types of parking spaces for every property
First obtaining all the parking types in the original dataset, then get the parking types for every property

In [16]:
parkings = parkings.fillna("None")
types, counts = np.unique(parkings.to_numpy().flatten(), return_counts=True)
list(zip(types, counts))

[('Allocated', 343),
 ('Communal', 81),
 ('Covered', 36),
 ('Driveway', 1257),
 ('Garage', 842),
 ('Gated', 89),
 ('None', 6187),
 ('Off Street', 1209),
 ('On Street', 458),
 ('Permit', 70),
 ('Private', 258),
 ('Rear', 27),
 ('Residents', 90)]

In [17]:
def get_parking_spaces_for_property(first: str, second: str, third: str):
    parking_types = {"Allocated": 0, "Off Street": 0, "Residents": 0, "On Street": 0, "Driveway": 0, "Garage": 0,
                     "Permit": 0, "Private": 0, "Gated": 0, "Covered": 0, "Communal": 0, "Rear": 0}

    if first in parking_types.keys():
        parking_types[first] += 1
    if second in parking_types.keys():
        parking_types[second] += 1
    if third in parking_types.keys():
        parking_types[third] += 1

    return parking_types

In [18]:
parking_types = {key: [] for key in types if key != "None"}
for i in range(len(parkings)):
    first = parkings.iloc[i, 0]
    second = parkings.iloc[i, 1]
    third = parkings.iloc[i, 2]
    types = get_parking_spaces_for_property(first, second, third)

    list(map(lambda value, d: parking_types[d].append(types[value]), types.keys(), types.keys()))

In [19]:
for k, v in parking_types.items():
    print("{:10s}\t{:5d}\t{:5d}".format(k, sum(v), len(v)))

Allocated 	  343	 3649
Communal  	   81	 3649
Covered   	   36	 3649
Driveway  	 1257	 3649
Garage    	  842	 3649
Gated     	   89	 3649
Off Street	 1209	 3649
On Street 	  458	 3649
Permit    	   70	 3649
Private   	  258	 3649
Rear      	   27	 3649
Residents 	   90	 3649


### Number of parking spaces for every property

In [20]:
parkings = data[parkings_name]
parking_num = [parkings.iloc[i].count() for i in range(len(parkings))]
parking_num = pd.DataFrame({"parking num": parking_num})
parking_num.head()

Unnamed: 0,parking num
0,3
1,0
2,1
3,0
4,3


In [21]:
parking_num.sum()

parking num    4760
dtype: int64

## Outside spaces

In [22]:
outside_name = list(filter(lambda s: "outside_space" in s, data.columns))
outside = data[outside_name].copy()
outside.head()

Unnamed: 0,RTD3308_outside_space1 - Outside Space Description,RTD3308_outside_space2 - Outside Space Description,RTD3308_outside_space3 - Outside Space Description
0,Communal Garden,,
1,,,
2,Rear Garden,Private Garden,
3,,,
4,Back Garden,Enclosed Garden,Private Garden


### Types of outside spaces for every property

In [23]:
def record_types(first: str, second: str, third: str, types: dict):
    if first in types.keys():
        types[first] += 1
    if second in types.keys():
        types[second] += 1
    if third in types.keys():
        types[third] += 1

    return types

In [24]:
outside = outside.fillna("None")
types, counts = np.unique(outside.to_numpy().flatten(), return_counts=True)
list(zip(types, counts))

[('Back Garden', 1366),
 ('Communal Garden', 227),
 ('Enclosed Garden', 735),
 ('Front Garden', 682),
 ('None', 6012),
 ('Patio', 431),
 ('Private Garden', 599),
 ('Rear Garden', 802),
 ('Terrace', 93)]

In [25]:
outside_types = {key: [] for key in types if key != "None"}
for i in range(len(outside)):
    types = {key: 0 for key in types if key != "None"}
    first = outside.iloc[i, 0]
    second = outside.iloc[i, 1]
    third = outside.iloc[i, 2]
    types = record_types(first, second, third, types)

    list(map(lambda value, d: outside_types[d].append(types[value]), types.keys(), types.keys()))

In [26]:
for k, v in outside_types.items():
    print("{:15s}{:5d}".format(k, sum(v)))

Back Garden     1366
Communal Garden  227
Enclosed Garden  735
Front Garden     682
Patio            431
Private Garden   599
Rear Garden      802
Terrace           93


### Number of outside spaces for every property

In [27]:
outside = data[outside_name]
outside_num = [outside.iloc[i].count() for i in range(len(outside))]
temp = pd.DataFrame({"outside_num": outside_num})
temp.head()

Unnamed: 0,outside_num
0,1
1,0
2,2
3,0
4,3


In [28]:
temp.sum()

outside_num    4935
dtype: int64

## Condition

In [29]:
condition_names = list(filter(lambda s: "condition" in s, data.columns))
condition = data[condition_names].copy()
condition.head()

Unnamed: 0,RTD3316_condition1 - Condition Description
0,Good
1,
2,Good
3,
4,Good


In [30]:
condition = condition.fillna("None")
condition.head()

Unnamed: 0,RTD3316_condition1 - Condition Description
0,Good
1,
2,Good
3,
4,Good


## Heating

In [31]:
heating_names = list(filter(lambda s: "heating" in s, data.columns))
heating = data[heating_names].copy()
heating.head()

Unnamed: 0,RTD3318_heating1 - Heating Description,RTD3318_heating2 - Heating Description,RTD3318_heating3 - Heating Description
0,Double Glazing,Electric,Night Storage
1,,,
2,Central,Gas Central,Double Glazing
3,,,
4,Central,Gas,Double Glazing


In [32]:
heating = heating.fillna("None")
types, count = np.unique(heating.to_numpy().flatten(), return_counts=True)
list(zip(types, count))

[('Air Conditioning', 6),
 ('Central', 476),
 ('Double Glazing', 1550),
 ('Eco-Friendly', 17),
 ('Electric', 183),
 ('Gas', 493),
 ('Gas Central', 1814),
 ('Night Storage', 24),
 ('None', 6270),
 ('Oil', 63),
 ('Solar', 15),
 ('Solar Water', 2),
 ('Under Floor', 34)]

### Types of heating for every property

In [33]:
heating_types = {key: [] for key in types if key != "None"}
for i in range(len(heating)):
    types = {key: 0 for key in types if key != "None"}

    first = heating.iloc[i, 0]
    second = heating.iloc[i, 1]
    third = heating.iloc[i, 2]
    types = record_types(first, second, third, types)

    list(map(lambda d: heating_types[d].append(types[d]), types.keys()))

In [34]:
for k, v in heating_types.items():
    print("{:20s}{:5d}".format(k, sum(v)))

Air Conditioning        6
Central               476
Double Glazing       1550
Eco-Friendly           17
Electric              183
Gas                   493
Gas Central          1814
Night Storage          24
Oil                    63
Solar                  15
Solar Water             2
Under Floor            34


### Number of heating

In [35]:
heating = data[heating_names]
heating_num = [heating.iloc[i].count() for i in range(len(heating))]
temp = pd.DataFrame({"heating_num": heating_num})
temp.head()

Unnamed: 0,heating_num
0,3
1,0
2,3
3,0
4,3


In [36]:
temp.sum()

heating_num    4677
dtype: int64

## Accessibility

In [37]:
accessibility_name = list(filter(lambda s: "accessibility" in s, data.columns))
accessibility = data[accessibility_name].copy()
accessibility.head()

Unnamed: 0,RTD3317_accessibility1 - accessibility Description,RTD3317_accessibility2 - accessibility Description,RTD3317_accessibility3 - accessibility Description
0,Not suitable for wheelchair users,,
1,,,
2,,,
3,,,
4,Level access,,


In [38]:
accessibility = accessibility.fillna("None")
types, count = np.unique(accessibility.to_numpy().flatten(), return_counts=True)

### Types of accessibility

In [39]:
accessibility_type = {key: [] for key in types if key != "None"}
for i in range(len(accessibility)):
    types = {key: 0 for key in types if key != "None"}

    first = accessibility.iloc[i, 0]
    second = accessibility.iloc[i, 1]
    third = accessibility.iloc[i, 2]
    types = record_types(first, second, third, types)

    list(map(lambda d: accessibility_type[d].append(types[d]), types.keys()))

In [40]:
for k, v in accessibility_type.items():
    print("{:35s}{:5d}".format(k, sum(v)))

Level access                         522
Lift access                           64
Not suitable for wheelchair users    779
Ramped access                         24
Wet room                              19
Wide doorways                         47


### Number of accessibility

In [41]:
accessibility = data[accessibility_name]
accessibility_num = [accessibility.iloc[i].count() for i in range(len(accessibility))]
temp = pd.DataFrame({"accessibility_num": accessibility_num})
temp

Unnamed: 0,accessibility_num
0,1
1,0
2,0
3,0
4,1
...,...
3644,0
3645,0
3646,0
3647,0


In [42]:
temp.sum()

accessibility_num    1455
dtype: int64

## Rooms

In [43]:
extract_area = parse.compile("{} ({} sqm){}")

### Bedrooms

In [44]:
info = data["EweMove Description S3 Rooms"]
handler = ProcessHTML()
for i in info:
    handler.EweMove_Description_S3_Rooms(i)

In [144]:
bedroom_names = [i for i in handler.s3_rooms_set if "Bedroom" in i]
bedroom_names

['Bedroom 6',
 'Attic Bedroom',
 'Bedroom (Double)',
 'Bedroom 8',
 'Bedroom 4',
 'Loft Bedroom',
 'Bedroom 5',
 'Bedroom 3',
 'Bedroom 1',
 'Bedroom (Single) with Ensuite',
 'Bedroom 7',
 'Bedroom 2',
 'Bedroom (Single)',
 'Bedroom/Living Room',
 'Master Bedroom with Ensuite',
 'Bedroom',
 'Master Bedroom',
 'Bedroom (Double) with Ensuite']

In [72]:
for room in handler.s3_rooms[: 5]:
    if room is None: continue
    for k, v in room.items():
        if k in bedroom_names:
            print("{:30s}{}".format(k, v))

Living/Dining Room            6.58m x 3.78m (24.8 sqm) - 21' 7" x 12' 4" (267 sqft)
Bedroom 1                     3.37m x 2.45m (8.2 sqm) - 11' x 8' (88 sqft)
Bedroom 2                     2.54m x 2.45m (6.2 sqm) - 8' 4" x 8' (67 sqft)
Bathroom                      2.14m x 2.04m (4.3 sqm) - 7' x 6' 8" (46 sqft)
Bedroom 1                     4.07m x 2.43m (9.8 sqm) - 13' 4" x 7' 11" (106 sqft)
Shower Room                   0.79m x 1.57m (1.2 sqm) - 2' 7" x 5' 1" (13 sqft)
Bedroom 2                     3.48m x 2.43m (8.4 sqm) - 11' 5" x 7' 11" (91 sqft)
Utility Room                  1.6m x 2.76m (4.4 sqm) - 5' 2" x 9' (47 sqft)
Downstairs Cloakroom          1.68m x 1.1m (1.8 sqm) - 5' 6" x 3' 7" (19 sqft)
Kitchen /  Dining Room        5.33m x 4.58m (24.4 sqm) - 17' 5" x 15' (262 sqft)
Bathroom                      1.98m x 2.37m (4.6 sqm) - 6' 5" x 7' 9" (50 sqft)
Bedroom 3                     5.74m x 2.31m (13.2 sqm) - 18' 9" x 7' 6" (142 sqft)
Bedroom 4                     5.34m x 3.13m

#### Obtain the maximum number of bedrooms in a property

In [73]:
bedrooms_num = []
for room in handler.s3_rooms:
    if room is None:
        bedrooms_num.append(0)
        continue

    bedrooms_num.append(len([key for key in room.keys() if "Bedroom" in key]))
max_bedroom_num = max(bedrooms_num)

#### Bedroom area

In [74]:
bedroom_names = ["Bedroom {}".format(i + 1) for i in range(max_bedroom_num)]
bedrooms = []
for i in range(len(handler.s3_rooms)):
    bedroom_per_property = {key: 0 for key in bedroom_names}
    if handler.s3_rooms[i] is None:
        bedrooms.append(bedroom_per_property)
        continue

    num = 1
    for k, v in handler.s3_rooms[i].items():
        if "Bedroom" not in k:
            continue

        if not isinstance(v, str):
            bedroom_per_property["Bedroom {}".format(num)] = 0
            num += 1
            continue

        bedroom_per_property["Bedroom {}".format(num)] = float(extract_area.parse(handler.s3_rooms[i][k])[1])
        num += 1

    bedrooms.append(bedroom_per_property)

In [75]:
bedrooms = pd.DataFrame(bedrooms)
bedrooms.head()

Unnamed: 0,Bedroom 1,Bedroom 2,Bedroom 3,Bedroom 4,Bedroom 5,Bedroom 6,Bedroom 7,Bedroom 8
0,8.2,6.2,0.0,0.0,0.0,0.0,0.0,0.0
1,9.8,8.4,13.2,16.7,0.0,0.0,0.0,0.0
2,10.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11.0,9.7,7.4,7.4,0.0,0.0,0.0,0.0


### Kitchen

In [50]:
for room in handler.s3_rooms[: 50]:
    if room is None: continue
    for k, v in room.items():
        if "kitchen" in k.lower():
            print("{:30s}{}".format(k, v))

Kitchen                       2.68m x 2.14m (5.7 sqm) - 8' 9" x 7' (61 sqft)
Kitchen /  Dining Room        5.33m x 4.58m (24.4 sqm) - 17' 5" x 15' (262 sqft)
Kitchen                       2.97m x 2.36m (7 sqm) - 9' 8" x 7' 8" (75 sqft)
Kitchen                       3.99m x 2.61m (10.4 sqm) - 13' 1" x 8' 6" (112 sqft)
Kitchen                       4.2m x 3.5m (14.7 sqm) - 13' 9" x 11' 5" (158 sqft)
Kitchen                       3.12m x 2.83m (8.8 sqm) - 10' 2" x 9' 3" (95 sqft)
Kitchen /  Dining Room        4.5m x 3.7m (16.6 sqm) - 14' 9" x 12' 1" (179 sqft)
Kitchen Diner                 4.76m x 2.78m (13.2 sqm) - 15' 7" x 9' 1" (142 sqft)
Kitchen /  Dining Room        2.82m x 4.46m (12.5 sqm) - 9' 3" x 14' 7" (135 sqft)
Kitchen                       1
Kitchen                       7.49m x 3.1m (23.2 sqm) - 24' 6" x 10' 2" (249 sqft)
Kitchen /  Dining Room        6.5m x 3.73m (24.2 sqm) - 21' 3" x 12' 2" (260 sqft)
Kitchen                       3.3m x 3m (9.9 sqm) - 10' 9" x 9' 10" (106

In [51]:
kitchens = []

for i in range(len(handler.s3_rooms)):
    kitchen_per_property = 0
    if handler.s3_rooms[i] is None:
        kitchens.append(kitchen_per_property)
        continue

    for k, v in handler.s3_rooms[i].items():
        if "kitchen" in k.lower():
            kitchen_per_property = 1
            break

    kitchens.append(kitchen_per_property)

In [52]:
kitchens = pd.DataFrame({"Kitchen": kitchens})
kitchens.head()

Unnamed: 0,Kitchen
0,1
1,1
2,1
3,0
4,1


### Living/Reception

In [53]:
living_room_names = [i for i in handler.s3_rooms_set if ("living" in i.lower() or "reception" in i.lower()) and "kitchen" not in i.lower()]
for room in handler.s3_rooms[: 50]:
    if room is None: continue
    for k, v in room.items():
        if k in living_room_names:
            print("{:30s}{}".format(k, v))

Living/Dining Room            6.58m x 3.78m (24.8 sqm) - 21' 7" x 12' 4" (267 sqft)
Living Room                   4.34m x 4.11m (17.8 sqm) - 14' 3" x 13' 5" (192 sqft)
Living Room                   4.9m x 3.7m (18.1 sqm) - 16' x 12' 1" (195 sqft)
Living Room                   4.16m x 8.69m (36.1 sqm) - 13' 7" x 28' 6" (389 sqft)
Living Room                   4.5m x 3.7m (16.6 sqm) - 14' 9" x 12' 1" (179 sqft)
Living Room                   3.83m x 3.58m (13.7 sqm) - 12' 6" x 11' 8" (147 sqft)
Reception Room                3.68m x 3.68m (13.5 sqm) - 12' x 12' (145 sqft)
Living Room                   3.63m x 4.25m (15.4 sqm) - 11' 10" x 13' 11" (166 sqft)
Living Room                   3m x 4.12m (12.3 sqm) - 9' 10" x 13' 6" (133 sqft)
Living Room                   3.6m x 3.73m (13.4 sqm) - 11' 9" x 12' 2" (144 sqft)


In [54]:
receptions = []
for i in range(len(handler.s3_rooms)):
    reception_per_property = 0
    if handler.s3_rooms[i] is None:
        receptions.append(reception_per_property)
        continue

    for k, v in handler.s3_rooms[i].items():
        if k in living_room_names and isinstance(v, str):
            reception_per_property += float(extract_area.parse(handler.s3_rooms[i][k])[1])

    receptions.append(reception_per_property)

In [55]:
receptions = pd.DataFrame({"Reception": receptions})
receptions.head()

Unnamed: 0,Reception
0,24.8
1,0.0
2,0.0
3,0.0
4,0.0


### Bathroom/WC/Washroom

In [56]:
bathroom_names = [i for i in handler.s3_rooms_set if "bathroom" in i.lower() or "WC" in i.lower()]
for room in handler.s3_rooms[: 50]:
    if room is None: continue
    for k, v in room.items():
        if k in bathroom_names:
            print("{:30s}{}".format(k, v))

Bathroom                      2.14m x 2.04m (4.3 sqm) - 7' x 6' 8" (46 sqft)
Bathroom                      1.98m x 2.37m (4.6 sqm) - 6' 5" x 7' 9" (50 sqft)
Bathroom                      1
Family Bathroom               1
Bathroom                      2.5m x 1.7m (4.2 sqm) - 8' 2" x 5' 6" (45 sqft)
Bathroom                      1
Bathroom                      2.24m x 1.89m (4.2 sqm) - 7' 4" x 6' 2" (45 sqft)
Bathroom                      2.08m x 1.81m (3.7 sqm) - 6' 9" x 5' 11" (40 sqft)
Bathroom                      1
Family Bathroom               3.29m x 3.17m (10.4 sqm) - 10' 9" x 10' 4" (112 sqft)
Family Bathroom               2.5m x 2m (5 sqm) - 8' 2" x 6' 6" (53 sqft)
Bathroom                      2.43m x 1.67m (4 sqm) - 7' 11" x 5' 5" (43 sqft)
Bathroom                      1.5m x 2.9m (4.3 sqm) - 4' 11" x 9' 6" (46 sqft)
Bathroom                      3.15m x 2.66m (8.3 sqm) - 10' 4" x 8' 8" (90 sqft)
Bathroom                      1.6m x 3.32m (5.3 sqm) - 5' 2" x 10' 10" (57 sqft

In [57]:
bathrooms = []
for i in range(len(handler.s3_rooms)):
    bathroom_per_property = {key: 0 for key in bathroom_names}
    if handler.s3_rooms[i] is None:
        bathrooms.append(bathroom_per_property)
        continue

    for k, v in handler.s3_rooms[i].items():
        if k in bathroom_names and isinstance(v, str):
            bathroom_per_property[k] = float(extract_area.parse(handler.s3_rooms[i][k])[1])

    bathrooms.append(bathroom_per_property)

In [58]:
bathrooms = pd.DataFrame(bathrooms)
bathrooms.head()

Unnamed: 0,Master Bathroom,Bathroom,Family Bathroom,Ensuite Bathroom,Guest Bathroom
0,0.0,4.3,0.0,0.0,0.0
1,0.0,4.6,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0


### Dining

In [59]:
dining_names = [i for i in handler.s3_rooms_set if ("dining" in i.lower() or "breakfast" in i.lower()) and "kitchen" not in i.lower() and "living" not in i.lower()]
for room in handler.s3_rooms[: 50]:
    if room is None: continue
    for k, v in room.items():
        if k in dining_names:
            print("{:30s}{}".format(k, v))

Dining Room                   3.5m x 3m (10.5 sqm) - 11' 5" x 9' 10" (113 sqft)
Dining Room                   3.13m x 2.83m (8.8 sqm) - 10' 3" x 9' 3" (95 sqft)
Dining Room                   4.3m x 3.13m (13.4 sqm) - 14' 1" x 10' 3" (144 sqft)
Dining Room                   3.1m x 2.6m (8 sqm) - 10' 2" x 8' 6" (86 sqft)
Dining Room                   5.09m x 3.7m (18.8 sqm) - 16' 8" x 12' 1" (203 sqft)
Dining Room                   3.7m x 4.12m (15.2 sqm) - 12' 1" x 13' 6" (164 sqft)


In [60]:
dining_rooms = []
for i in range(len(handler.s3_rooms)):
    dining_room_per_property = 0
    if handler.s3_rooms[i] is None:
        dining_rooms.append(dining_room_per_property)
        continue

    for k, v in handler.s3_rooms[i].items():
        if k in dining_names and isinstance(v, str):
            dining_room_per_property += float(extract_area.parse(v)[1])

    dining_rooms.append(dining_room_per_property)

In [61]:
dining_rooms = pd.DataFrame({"Dining room": dining_rooms})
dining_rooms.iloc[:10]

Unnamed: 0,Dining room
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,10.5
6,8.8
7,0.0
8,0.0
9,0.0


### Work/study

In [89]:
work_names = [i for i in handler.s3_rooms_set if "work" in i.lower() or "study" in i.lower() or "office" in i.lower()]
for room in handler.s3_rooms[: 300]:
    if room is None: continue
    for k, v in room.items():
        if k in work_names:
            print("{:30s}{}".format(k, v))

Study                         2.27m x 1.98m (4.5 sqm) - 7' 5" x 6' 5" (48 sqft)
Garage / Office               5m x 3.1m (15.5 sqm) - 16' 4" x 10' 2" (166 sqft)
Study                         2.16m x 3.14m (6.7 sqm) - 7' 1" x 10' 3" (73 sqft)
Study Area                    2.5m x 1.85m (4.6 sqm) - 8' 2" x 6' (49 sqft)
Study                         5.7m x 2m (11.4 sqm) - 18' 8" x 6' 6" (122 sqft)
Workspace                     8.69m x 2m (17.4 sqm) - 28' 6" x 6' 6" (187 sqft)
Study                         3m x 3m (9 sqm) - 9' 10" x 9' 10" (96 sqft)
Workshop                      14m x 3.6m (50.4 sqm) - 45' 11" x 11' 9" (542 sqft)
Study/Family Room/Play Room   4m x 2.91m (11.6 sqm) - 13' 1" x 9' 6" (125 sqft)
Study                         3.47m x 2.18m (7.5 sqm) - 11' 4" x 7' 1" (81 sqft)
Workshop                      5.09m x 2.14m (10.9 sqm) - 16' 8" x 7' (117 sqft)
Study                         2.1m x 2.1m (4.4 sqm) - 6' 10" x 6' 10" (47 sqft)
Study Area                    2.8m x 2.54m (7.1

In [90]:
work_spaces = []
for room in handler.s3_rooms:
    work_per_property = 0
    if room is None:
        work_spaces.append(work_per_property)
        continue

    for k, v in room.items():
        if k in work_names and isinstance(v, str):
            work_per_property += float(extract_area.parse(v)[1])

    work_spaces.append(work_per_property)

In [91]:
work_spaces = pd.DataFrame({"work space": work_spaces})
work_spaces

Unnamed: 0,work space
0,0.0
1,0.0
2,0.0
3,0.0
4,4.5
...,...
3644,0.0
3645,0.0
3646,0.0
3647,0.0


### Other rooms

In [139]:
bedroom_names = set([i for i in handler.s3_rooms_set if "Bedroom" in i])
kitchen_names = set(i for i in handler.s3_rooms_set if "kitchen" in i.lower())
reception_names = set(i for i in handler.s3_rooms_set if ("living" in i.lower() or "reception" in i.lower()) and "kitchen" not in i.lower())
garden_names = set([i for i in handler.s3_rooms_set if "garden" in i.lower() or "yard" in i.lower()])
bathroom_names = set(i for i in handler.s3_rooms_set if "bathroom" in i.lower() or "wc" in i.lower() or "washroom" in i.lower())
dining_names = set(i for i in handler.s3_rooms_set if ("dining" in i.lower() or "breakfast" in i.lower()) and "kitchen" not in i.lower() and "living" not in i.lower())
work_names = set([i for i in handler.s3_rooms_set if "work" in i.lower() or "study" in i.lower() or "office" in i.lower()])
parking = set(i for i in handler.s3_rooms_set if "car" in i.lower() or "garage" in i.lower() or "driveway" in i.lower() or "park" in i.lower())

rest_names = handler.s3_rooms_set - bedroom_names - kitchen_names - reception_names - garden_names - bathroom_names - dining_names - work_names - parking
rest_names

{'Airing Cupboard',
 'Annexe',
 'Approach',
 'Attic',
 'Balcony',
 'Barn',
 'Basement',
 'Boiler Room',
 'Boot Room',
 'Cabin',
 'Cellar',
 'Cinema Room',
 'Cloakroom',
 'Coat Room',
 'Conservatory',
 'Day Room',
 'Decking',
 'Den',
 'Downstairs Cloakroom',
 'Drawing Room',
 'Dressing Room',
 'Electric Gate',
 'Ensuite',
 'Ensuite Shower Room',
 'Entrance Hall',
 'Entrance Porch',
 'Extension',
 'Exterior',
 'Family Room',
 'First Floor Landing',
 'Foyer',
 'Front Access',
 'Front Porch',
 'Front Room',
 'Frontroom',
 'Galleried Landing',
 'Gallery Landing',
 'Games Room',
 'Green House',
 'Guest Room',
 'Gym',
 'Hall',
 'Hallway',
 'Hobby Room',
 'Hot Tub',
 'Inner Hall',
 'Inner Lobby',
 'Jack & Jill Ensuite',
 'Land',
 'Landing',
 'Larder',
 'Laundry Room',
 'Lift',
 'Lobby',
 'Loft',
 'Loft Room',
 'Lounge',
 'Lounge Diner',
 'Lower Ground Floor',
 'Man Cave',
 'Manège',
 'Mezzanine',
 'Morning Room',
 'Open Front Porch',
 'Orangery',
 'Outbuilding',
 'Outside Storage',
 'Paddock',

In [108]:
for room in handler.s3_rooms[:50]:
    if room is None: continue
    for k, v in room.items():
        if k in rest_names:
            print("{:30s}{}".format(k, v))

Entrance Hall                 1
Porch                         1.48m x 2.31m (3.4 sqm) - 4' 10" x 7' 6" (36 sqft)
Hallway                       5.03m x 2.77m (13.9 sqm) - 16' 6" x 9' 1" (149 sqft)
Landing                       2.41m x 1.06m (2.5 sqm) - 7' 10" x 3' 5" (27 sqft)
Lounge                        4.33m x 5.36m (23.2 sqm) - 14' 2" x 17' 7" (249 sqft)
Entrance Porch                1
Lounge Diner                  6.76m x 4.04m (27.3 sqm) - 22' 2" x 13' 3" (293 sqft)
Lobby                         1
Landing                       1
Entrance Hall                 1
Lounge Diner                  6.66m x 3.1m (20.6 sqm) - 21' 10" x 10' 2" (222 sqft)
Conservatory                  1
Lounge                        5.4m x 3.5m (18.9 sqm) - 17' 8" x 11' 5" (203 sqft)
Vestibule                     2.2m x 1.7m (3.7 sqm) - 7' 2" x 5' 6" (40 sqft)
Hallway                       1
First Floor Landing           1
Cellar                        2m x 2m (4 sqm) - 6' 6" x 6' 6" (43 sqft)
Entrance Hall  

In [115]:
other_rooms = []
for room in handler.s3_rooms:
    other_room_per_property = 0
    if room is None:
        other_rooms.append(other_room_per_property)
        continue

    for k, v in room.items():
        if k in rest_names:
            other_room_per_property += 1
    other_rooms.append(other_room_per_property)

In [117]:
other_rooms = pd.DataFrame({"Other rooms": other_rooms})
other_rooms.head()

Unnamed: 0,Other rooms
0,1
1,4
2,4
3,0
4,3


In [140]:
def get_rooms(dataset, *args):
    room_names = []
    for room in dataset:
        for arg in args:
            if arg in room.lower():
                room_names.append(room)
    return room_names

result = get_rooms(handler.s3_rooms_set, "bedroom")
result

['Bedroom 6',
 'Attic Bedroom',
 'Bedroom (Double)',
 'Bedroom 8',
 'Bedroom 4',
 'Loft Bedroom',
 'Bedroom 5',
 'Bedroom 3',
 'Bedroom 1',
 'Bedroom (Single) with Ensuite',
 'Bedroom 7',
 'Bedroom 2',
 'Bedroom (Single)',
 'Bedroom/Living Room',
 'Master Bedroom with Ensuite',
 'Bedroom',
 'Master Bedroom',
 'Bedroom (Double) with Ensuite']

In [141]:
for i in result:
    if i not in bedroom_names:
        print("False")
        break

In [142]:
len(result), len(bedroom_names)

(18, 18)

In [143]:
dataset = {"a", "b", "c", "d", "dd", "aa"}
result = get_rooms(dataset, "a", "d")
test = [i for i in dataset if "a" in i or "d" in i]

result, test

(['dd', 'aa', 'a', 'd'], ['dd', 'aa', 'a', 'd'])