# Import packages

In [1]:
import pandas as pd
import numpy as np
import parse
import re
import seaborn as sns
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt

from ProcessHTML import ProcessHTML
from ExtractRooms import ExtractRooms
from GeneralizeDataset import GeneralizeDataset

# Preprocess dataset
## HTML Texts

In [2]:
# Load the property dataset; the CSV is decoded as ISO-8859-1.
filename = "../datasets/PropertyData_wDesc.csv"
data = pd.read_csv(
    filepath_or_buffer=filename,
    encoding="ISO8859-1",
)

In [3]:
# NOTE(review): the next cell reads handler.s3_rooms and handler.price_or_rent
# by row position, so each call below presumably stores one result per row,
# in call order -- confirm in ProcessHTML.
handler = ProcessHTML()

# Pass every raw room description to the handler, in row order.
rooms = data["EweMove Description S3 Rooms"]
for room in rooms:
    handler.EweMove_Description_S3_Rooms(room)

# Then pass every raw price/rent field, in the same row order.
price = data["Price / Rent"]
for p in price:
    handler.price_rent(p)

In [4]:
# Every row position of the loaded frame.
indices = set(range(len(data)))

# Positions where the handler holds a non-None room entry.
room_indices = {
    pos
    for pos in range(len(handler.s3_rooms))
    if handler.s3_rooms[pos] is not None
}

# Positions where the first element of the handler's price entry is non-zero.
price_indices = {
    pos
    for pos in range(len(handler.price_or_rent))
    if handler.price_or_rent[pos][0] != 0
}

In [5]:
# Keep only the row positions that pass both the room and the price checks.
valid_indices = indices.intersection(room_indices, price_indices)

## Categorical data

In [6]:
generalize = GeneralizeDataset(data)

# One get_feature_num call per feature group; the calls run in the order the
# feature keys are listed, and each result is bound to the matching name.
parking, outside, heating, accessibility = (
    generalize.get_feature_num(feature)
    for feature in ("parking", "outside_space", "heating", "accessibility")
)

In [7]:
def _present_positions(column):
    """Return the row positions in `indices` where `data[column]` is not null.

    The not-null mask is computed once per column (the previous form rebuilt
    it for every row, which is quadratic in the number of rows) and is read
    positionally, matching the later `data.iloc[...]` selection. With the
    default RangeIndex produced by `pd.read_csv`, positions and labels are
    identical, so the resulting sets are unchanged.
    """
    present = data[column].notna().to_numpy()
    return {pos for pos in indices if present[pos]}


# Rows that have a value for each required categorical field.
condition_indices = _present_positions("RTD3316_condition1 - Condition Description")
qualifier_indices = _present_positions("Price Qualifier")
council_tax_indices = _present_positions("DESC Council Tax Band")

In [8]:
# Narrow the valid rows further to those with condition, price qualifier and
# council tax band all present.
valid_indices = valid_indices.intersection(
    condition_indices,
    qualifier_indices,
    council_tax_indices,
)

# Obtain the dataset for model input

In [9]:
# Column groups are selected by a case-sensitive substring match on the label.
parking_names = [col for col in data.columns if "parking" in col]
outside_names = [col for col in data.columns if "outside" in col]
heating_names = [col for col in data.columns if "heating" in col]
accessibility_names = [col for col in data.columns if "accessibility" in col]
condition_names = [col for col in data.columns if "condition" in col]

# Fixed columns first, then each matched group in a stable order.
column_names = [
    "Postcode", "Sale or Let", "EweMove Description S3 Rooms", "Price / Rent",
    "Price Qualifier", "DESC Council Tax Band", "# of Enquiry or viewings", "# of Apps/Offers",
    *parking_names,
    *outside_names,
    *heating_names,
    *accessibility_names,
    *condition_names,
]

# From here on valid_indices is an ascending list of row positions (not a set).
valid_indices = sorted(valid_indices)
input_data = data.iloc[valid_indices][column_names]
input_data.head()

Unnamed: 0,Postcode,Sale or Let,EweMove Description S3 Rooms,Price / Rent,Price Qualifier,DESC Council Tax Band,# of Enquiry or viewings,# of Apps/Offers,RTD3307_parking1 - Parking Description,RTD3307_parking2 - Parking Description,...,RTD3308_outside_space1 - Outside Space Description,RTD3308_outside_space2 - Outside Space Description,RTD3308_outside_space3 - Outside Space Description,RTD3318_heating1 - Heating Description,RTD3318_heating2 - Heating Description,RTD3318_heating3 - Heating Description,RTD3317_accessibility1 - accessibility Description,RTD3317_accessibility2 - accessibility Description,RTD3317_accessibility3 - accessibility Description,RTD3316_condition1 - Condition Description
0,LU7 4WN,Sale,This home includes:<ul><li><strong>01 - Entran...,"<font color='blue'>&pound;140,000</font><br>Of...",Offers In Excess Of,Band B,32,12,Allocated,Off Street,...,Communal Garden,,,Double Glazing,Electric,Night Storage,Not suitable for wheelchair users,,,Good
2,DA17 5PJ,Sale,This home includes:<ul><li><strong>01 - Entran...,"<font color='blue'>&pound;325,000</font><br>Gu...",Guide Price,Band D,14,4,On Street,,...,Rear Garden,Private Garden,,Central,Gas Central,Double Glazing,,,,Good
6,RG26 5PX,Sale,This home includes:<ul><li><strong>01 - Living...,"<font color='blue'>&pound;500,000</font><br>Gu...",Guide Price,Band E,10,2,Driveway,Garage,...,Back Garden,Enclosed Garden,Patio,Double Glazing,Gas Central,Under Floor,,,,Good
7,BD8 0HT,Sale,This home includes:<ul><li><strong>01 - Hallwa...,"<font color='blue'>&pound;170,000</font><br>Of...",Offers in Region Of,Band B,9,1,Driveway,,...,Front Garden,Rear Garden,,Gas Central,,,Level access,,,Good
15,HU17 7AB,Sale,This home includes:<ul><li><strong>01 - Entran...,"<font color='blue'>&pound;410,000</font><br>Of...",Offers Over,Band B,1,2,Off Street,Allocated,...,Rear Garden,,,Gas Central,,,Level access,,,Good


In [15]:
# Everything except the two raw HTML columns; labels that are absent are
# skipped rather than raising.
general_data = input_data.drop(
    columns=["EweMove Description S3 Rooms", "Price / Rent"],
    errors="ignore",
)
general_data.head()

Unnamed: 0,Postcode,Sale or Let,Price Qualifier,DESC Council Tax Band,# of Enquiry or viewings,# of Apps/Offers,RTD3307_parking1 - Parking Description,RTD3307_parking2 - Parking Description,RTD3307_parking3 - Parking Description,RTD3308_outside_space1 - Outside Space Description,RTD3308_outside_space2 - Outside Space Description,RTD3308_outside_space3 - Outside Space Description,RTD3318_heating1 - Heating Description,RTD3318_heating2 - Heating Description,RTD3318_heating3 - Heating Description,RTD3317_accessibility1 - accessibility Description,RTD3317_accessibility2 - accessibility Description,RTD3317_accessibility3 - accessibility Description,RTD3316_condition1 - Condition Description
0,LU7 4WN,Sale,Offers In Excess Of,Band B,32,12,Allocated,Off Street,Residents,Communal Garden,,,Double Glazing,Electric,Night Storage,Not suitable for wheelchair users,,,Good
2,DA17 5PJ,Sale,Guide Price,Band D,14,4,On Street,,,Rear Garden,Private Garden,,Central,Gas Central,Double Glazing,,,,Good
6,RG26 5PX,Sale,Guide Price,Band E,10,2,Driveway,Garage,Off Street,Back Garden,Enclosed Garden,Patio,Double Glazing,Gas Central,Under Floor,,,,Good
7,BD8 0HT,Sale,Offers in Region Of,Band B,9,1,Driveway,,,Front Garden,Rear Garden,,Gas Central,,,Level access,,,Good
15,HU17 7AB,Sale,Offers Over,Band B,1,2,Off Street,Allocated,,Rear Garden,,,Gas Central,,,Level access,,,Good


In [16]:
# NOTE(review): this cell previously iterated over the builtin `range` type
# itself (`for i in range`), which raises TypeError and stops a full re-run.
# The intended expression is not visible; collecting the handler's room entry
# for each valid row position is the most plausible reading -- confirm.
# Every position in valid_indices has a non-None entry in handler.s3_rooms.
# This rebinds `rooms`, which earlier held the raw description column.
rooms = [handler.s3_rooms[i] for i in valid_indices]

[0,
 2,
 6,
 7,
 15,
 19,
 28,
 118,
 121,
 127,
 155,
 157,
 165,
 167,
 168,
 170,
 172,
 177,
 191,
 193,
 209,
 212,
 217,
 219,
 223,
 224,
 225,
 230,
 232,
 233,
 237,
 240,
 242,
 243,
 314,
 323,
 344,
 345,
 347,
 348,
 350,
 351,
 353,
 357,
 359,
 360,
 362,
 363,
 364,
 365,
 366,
 368,
 371,
 373,
 375,
 376,
 377,
 378,
 379,
 380,
 382,
 384,
 385,
 386,
 387,
 388,
 390,
 391,
 392,
 393,
 394,
 395,
 396,
 397,
 398,
 399,
 400,
 401,
 403,
 406,
 409,
 410,
 411,
 412,
 414,
 415,
 417,
 418,
 419,
 420,
 425,
 427,
 428,
 429,
 430,
 431,
 432,
 433,
 435,
 438,
 440,
 441,
 442,
 445,
 449,
 450,
 451,
 478,
 479,
 480,
 481,
 482,
 483,
 485,
 488,
 489,
 490,
 492,
 496,
 507,
 522,
 525,
 527,
 530,
 531,
 534,
 536,
 537,
 548,
 554,
 555,
 559,
 562,
 565,
 566,
 567,
 573,
 574,
 576,
 578,
 579,
 581,
 582,
 592,
 593,
 594,
 595,
 597,
 598,
 599,
 600,
 601,
 604,
 605,
 626,
 627,
 629,
 631,
 632,
 633,
 634,
 635,
 636,
 637,
 638,
 639,
 640,
 641,
 64