# Import packages

In [7]:
import pandas as pd
import numpy as np
import parse
import re
import seaborn as sns
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt

from ProcessHTML import ProcessHTML
from ExtractRooms import ExtractRooms
from GeneralizeDataset import GeneralizeDataset

# Preprocess dataset
## HTML Texts

In [2]:
# Location of the raw property export, relative to the notebook's working directory.
filename = "../datasets/PropertyData_wDesc.csv"
# The CSV is decoded as ISO-8859-1 instead of pandas' default UTF-8.
data = pd.read_csv(filename, encoding="ISO8859-1")

In [3]:
# ProcessHTML collects its parsed results on the handler instance; later cells
# read them back through handler.s3_rooms and handler.price_or_rent.
handler = ProcessHTML()

# Feed every room description to the handler, in row order.
rooms = data["EweMove Description S3 Rooms"]
for room_html in rooms:
    handler.EweMove_Description_S3_Rooms(room_html)

# Then feed every price/rent cell, again in row order.
price = data["Price / Rent"]
for price_html in price:
    handler.price_rent(price_html)

In [4]:
# Every row position of the raw table.
indices = set(range(len(data)))

# Positions whose room description was parsed (entry is not None).
# Index lookup is kept because the container type of handler.s3_rooms is not
# visible from this notebook.
room_indices = {
    pos for pos in range(len(handler.s3_rooms))
    if handler.s3_rooms[pos] is not None
}

# Positions whose parsed price entry has a non-zero first element.
price_indices = {
    pos for pos in range(len(handler.price_or_rent))
    if handler.price_or_rent[pos][0] != 0
}

In [5]:
# A row is usable only if it has both parsed rooms and a non-zero price.
valid_indices = indices.intersection(room_indices, price_indices)

## Categorical data

In [6]:
generalize = GeneralizeDataset(data)

# One get_feature_num call per categorical feature group, in the same order
# as before; results are bound to one variable each.
parking, outside, heating, accessibility = (
    generalize.get_feature_num(feature)
    for feature in ("parking", "outside_space", "heating", "accessibility")
)

KeyboardInterrupt: 

In [None]:
# Row positions at which each required categorical column is present (not NaN).
#
# Each null mask is built exactly once per column and the positions of its
# True entries are read off in one vectorised pass. Calling `.notna()` inside
# the per-row filter would rebuild the full mask for every row (quadratic work).
#
# `data` comes straight from `pd.read_csv` without an index column, so its
# labels are 0..len(data)-1 and positions equal labels; positions are also
# what the later `data.iloc[...]` and list lookups on `handler` expect.
condition_present = data["RTD3316_condition1 - Condition Description"].notna().to_numpy()
qualifier_present = data["Price Qualifier"].notna().to_numpy()
council_tax_present = data["DESC Council Tax Band"].notna().to_numpy()

# `.tolist()` yields plain Python ints so the sets mix cleanly with `indices`.
condition_indices = set(np.flatnonzero(condition_present).tolist())
qualifier_indices = set(np.flatnonzero(qualifier_present).tolist())
council_tax_indices = set(np.flatnonzero(council_tax_present).tolist())

In [None]:
# Narrow the usable rows to those that also have a condition description,
# a price qualifier and a council tax band.
valid_indices = valid_indices.intersection(
    condition_indices, qualifier_indices, council_tax_indices
)

# Obtain the dataset for model input

In [None]:
# Columns of each feature group are found by substring match on the column name.
parking_names = [col for col in data.columns if "parking" in col]
outside_names = [col for col in data.columns if "outside" in col]
heating_names = [col for col in data.columns if "heating" in col]
accessibility_names = [col for col in data.columns if "accessibility" in col]
condition_names = [col for col in data.columns if "condition" in col]

# Fixed columns first, followed by the feature groups in a fixed order.
column_names = [
    "Postcode",
    "Sale or Let",
    "EweMove Description S3 Rooms",
    "Price / Rent",
    "Price Qualifier",
    "DESC Council Tax Band",
    "# of Enquiry or viewings",
    "# of Apps/Offers",
    *parking_names,
    *outside_names,
    *heating_names,
    *accessibility_names,
    *condition_names,
]

# From here on valid_indices is an ascending list instead of a set.
valid_indices = sorted(valid_indices)
input_data = data.iloc[valid_indices][column_names]
input_data.head()

In [None]:
# Keep every column except the two raw HTML/text columns, which are handled
# separately through `handler`.
general_data = input_data.loc[
    :,
    [
        col not in ("EweMove Description S3 Rooms", "Price / Rent")
        for col in input_data.columns
    ],
]
general_data.head()

## Price for sale and rental

In [None]:
# First element of each parsed price entry, for the usable rows only.
prices = pd.DataFrame([handler.price_or_rent[row][0] for row in valid_indices])

# Relabel the fresh 0..n-1 index with the original row numbers so the frame
# lines up with the frames sliced from `data`.
rename_dict = dict(enumerate(valid_indices))
prices = prices.rename(index=rename_dict)

### Sale price

In [None]:
# Distribution of prices for the rows flagged as sales.
sale_prices = prices[general_data["Sale or Let"].eq("Sale")]
sns.displot(sale_prices)
plt.title("Sale Price")

In [None]:
# Distribution of prices for the rows flagged as rentals.
# NOTE(review): the column is called "Sale or Let"; confirm that the non-sale
# category is really spelled "Rental" in the data, otherwise this frame is empty.
rental_price = prices[general_data["Sale or Let"].eq("Rental")]
sns.displot(rental_price)
plt.title("Rental Price")

## Extract room information

In [None]:
# Parsed room entries for the usable rows only, in ascending row order.
# Note: this rebinds `rooms`, which earlier held the raw description column.
rooms = [handler.s3_rooms[i] for i in valid_indices]
# The third argument looks like a `parse`-style template ("<name> (<area> sqm)<rest>");
# how ExtractRooms uses it and handler.s3_rooms_set is not visible here — TODO confirm.
extract_room = ExtractRooms(rooms, handler.s3_rooms_set, "{} ({} sqm){}")

### Bedrooms

In [None]:
# Rooms matching "bedroom", using get_rooms' default operation
# (the default is defined in ExtractRooms and is not visible here).
bedrooms = extract_room.get_rooms("bedroom")

In [None]:
# Tabulate the bedroom data and give the first six columns readable names.
bedrooms = pd.DataFrame(bedrooms)

# pandas labels columns with integers (0, 1, ...) when rows are list-like and
# with the dict keys when rows are dicts. A mapping that only contains the
# string labels "0".."5" is silently ignored for integer labels, so both
# spellings are mapped; labels absent from the frame are ignored by rename.
rename_dict = {
    label: "bedroom {}".format(i + 1)
    for i in range(6)
    for label in (i, str(i))
}
bedrooms = bedrooms.rename(columns=rename_dict)
bedrooms.head()

### Kitchens

In [None]:
# Rooms matching "kitchen", reduced with operation="number"
# (presumably a per-property count — confirm against ExtractRooms).
kitchens = extract_room.get_rooms("kitchen", operation="number")

In [None]:
# Single-column frame; column 0 becomes "kitchen number".
kitchens = pd.DataFrame(kitchens).rename(columns={0: "kitchen number"})
kitchens.head()

### Living/Reception

In [None]:
# Rooms matching either "living" or "reception", reduced with operation="sum"
# (presumably the summed area per property — confirm against ExtractRooms).
receptions = extract_room.get_rooms("living", "reception", operation="sum")

In [None]:
# Single-column frame; column 0 becomes "living area".
receptions = pd.DataFrame(receptions).rename(columns={0: "living area"})
receptions.head()

### Bathrooms

In [None]:
# Rooms matching "bathroom", "wc" or "washroom", reduced with operation="number"
# (presumably a per-property count — confirm against ExtractRooms).
bathrooms = extract_room.get_rooms("bathroom", "wc", "washroom", operation="number")

In [None]:
# Single-column frame; column 0 becomes "bathroom number".
bathrooms = pd.DataFrame(bathrooms).rename(columns={0: "bathroom number"})
bathrooms.head()

### Dining

In [None]:
# Rooms matching "dining", reduced with operation="number"
# (presumably a per-property count — confirm against ExtractRooms).
dining_rooms = extract_room.get_rooms("dining", operation="number")

In [None]:
# Single-column frame; column 0 becomes "dining number".
dining_rooms = pd.DataFrame(dining_rooms).rename(columns={0: "dining number"})
dining_rooms.head()

### Other spaces

In [None]:
# Remaining rooms as reported by ExtractRooms.get_rest_rooms(); presumably those
# not claimed by the earlier get_rooms calls — confirm against ExtractRooms.
others = extract_room.get_rest_rooms()

In [None]:
# Two-column frame; column 0 becomes "other number" and column 1 "other area".
others = pd.DataFrame(others).rename(
    columns={0: "other number", 1: "other area"}
)
others.head()

In [None]:
# Place all per-room-type frames side by side, one row per property.
room_frames = [bedrooms, kitchens, receptions, bathrooms, dining_rooms, others]
room_info = pd.concat(room_frames, axis="columns")
room_info.head()