In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
#Essentials
import numpy as np
import pandas as pd
import pickle
import re
import datetime as dt
from dateutil.relativedelta import *
import time
import seaborn as sns

#SQL related - NEED TO DECIDE WHICH ONE I'LL BE USING AND DELETE THE REST
import sqlite3
import pandas.io.sql as pd_sql
# import psycopg2
# from sqlalchemy import create_engine

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 6000)

In [48]:
def get_energy_label(value):
    """Takes 'Energy label' column and strips it from the words 'What does this mean?'
    Return just the label"""
    no_touch_list = ['Not required', 'Not available', np.NaN]
    if value not in no_touch_list:
        return value[0]
    else: 
        return np.NaN

def get_int(value):
    """Trims the price, area and other fields with numbers and converts them into int"""
    try:
        return re.sub('[€\sk.,m²m³v.o.n.permonthBeforeAfter]', '', value)
    except:
        return np.NaN

def get_listed_date(value):
    """Whenever possible, converts text date value to date format (takes "Listen Since" field value as an argument)
    Returns the date value and a boolean True or False to identify whether it successfully converted the date or not."""
    try:
        listed_date = dt.datetime.strptime(value, '%B %d, %Y')
        listed_date = listed_date.date()
        return listed_date, True
    except:
        return np.NaN, False
    
    
def get_listed_days(value, scraped_date):
    """Converts a string in the 'Listed since' column into a value for number of days listed for"""
    if scraped_date == np.NaN:
        scraped_date = dt.date(2020, 2, 9)
    
    try:
        listed_date = dt.datetime.strptime(value, '%B %d, %Y')
        days_listed = (scraped_date - listed_date).days
    except:
        if 'Today' in value:
            listed_days = 1
        elif 'week' in value:
            weeks_listed = int(re.search('\d*', value).group())
            listed_date = scraped_date - relativedelta(weeks=weeks_listed)
            days_listed = (scraped_date - listed_date).days
        elif 'month' in value:
            months_listed = int(re.search('\d*', value).group())
            listed_date = scraped_date - relativedelta(months=months_listed)
            days_listed = (scraped_date - listed_date).days
        elif '6+' in value:
            listed_date = scraped_date - relativedelta(years=years_listed)
            days_listed = 200
        else:
            listed_date = np.NaN
            days_listed = np.NaN
    return days_listed
    
def get_rooms(value, room_type):
    """Retrieves the number of rooms specified by the type (room, bedroom, toilet, bathroom, etc.).
    If bedrooms are not specified and there is only 1 room - returns 0, if there is more than 1 room, but bedrooms not specified - returns NaN.
    Otherwise, returns the number of bedrooms"""
    try:
        value = value.lower()
    except:
        return np.NaN
    
    if room_type == 'room':
        try:
            return int(re.search(f'\d* {room_type}', value).group().strip(f' {room_type}'))
        except:
            return np.NaN
    
    if room_type == 'bedroom':
        try:
            if room_type not in value and int(re.search(f'\d* {room_type}', value).group().strip(f' {room_type}')) == 1:
                return 0
            elif room_type not in value:
                return np.NaN
            else:
                return int(re.search(f'\d* {room_type}', value).group().strip(f' {room_type}'))
        except:
            return np.NaN
    
    if room_type == 'toilet':
        try:
            if room_type not in value and int(re.search(f'\d* {room_type}', value).group().strip(f' {room_type}')) == 1:
                return 1
            elif room_type not in value:
                return np.NaN
            else:
                return int(re.search(f'\d*\s[a-z]*\s?{room_type}', value).group().strip(f' separate {room_type}'))
        except:
            return np.NaN
        
    if room_type == 'bathroom':
        try:
            if room_type not in value and int(re.search(f'\d*\s[a-z]*\s?toilet', value).group().strip(f' separate {room_type}')) == 1:
                return 1
            elif room_type not in value:
                return np.NaN
            else:
                return int(re.search(f'\d* {room_type}', value).group().strip(f' {room_type}'))
        except:
            return np.NaN
        
    
def get_bath_flag(value):
    """Takes Bathroom facilities column and create a Bath_Flag column if a bathtub / bath is available in the property.
    Returns True or False"""
    try:
        if 'bath' in value.lower():
            return True
        else:
            return False
    except:
        return np.NaN
    
    
def get_facilities(value, facility_type):
    """Take the Bathroom facilities column and facility type (toilet, shower, bath, jacuzzi, steam cabin, etc.)
    and returns the number of specified facilities"""
    try:
        value = value.lower()
        facility_type = facility_type.lower()
    except:
        return np.NaN
    
    try:
        return int(re.search(f'\d* {facility_type}', value).group().strip(f' {facility_type}'))
    except:
        if facility_type in value:
            return 1
        else:
            return 0

In [5]:
# Load the list of property advertisment dictionaries from a pickle file
with open('./Cellar/latest_ads_dataset.pkl', 'rb') as ads_list_pickle:
            ads_list = pickle.load(ads_list_pickle)

print(len(ads_list))

with open('./Cellar/new_adverts_2020414.pkl', 'rb') as new_ads_pickle:
            new_ads = pickle.load(new_ads_pickle)

print(len(new_ads))

8313
1365


### Dictionaries to Pandas without duplicates:

1. Get all the columns from dictionaries
2. Add Add "Listed_date_present", "listed_date" and "is_duplicate" columns
3. Create a pandas dataframe with the column list retrieved in step 1
4. Iterate over the ads_list and add a listed_date field (using get_listed_date function), where possible  
    i. In this step, as well add another key-value pair to identify records with a successful conversion of "Listed since" value to date format  
    ii. Since we are iterating already, just add the "is_duplicate" column as well
5. Now, iterate over the ads_list and append records to the empty dataframe (from step 2)  
    i. In the process check for duplicates and update "is_duplicate" value, if a record with the same id, address, asking price and status, has already been seen
6. As a final step, filter the dataframe to only keep **unique** records **with** a listed date (where possible)

In [80]:
# Step 1 - This is to create a pandas dataframe with the column sorting as in the dictionaries
column_list = []

for ad in ads_list:
    for feat_name in list(ad.keys()):
        if feat_name not in column_list:
            column_list.append(feat_name)      
            

# Step 2 - Add "Listed_date_present", "listed_date" and "is_duplicate" columns
column_list.append('listed_date')
column_list.append('listed_date_present')
column_list.append('is_duplicate')


# Step 3 - Initiate the dataframe with the desired columns
ads_df = pd.DataFrame(columns=column_list)


# Step 4 - Iterate over the ads_list and replace "Listed since" field with a date (using get_listed_date function), where possible 
    #i. #As part of this step, add another key-value pair to identify records with a successful conversion of "Listed since" to date
    #ii. Since we are iterating already, just add the "is_duplicate" column as well
for ad in ads_list:
    ad['listed_date'], ad['listed_date_present'] = get_listed_date(ad['Listed since'])
    ad['is_duplicate'] = False
    
    
# Step 5 - iterate over the ads_list and append unique records to the empty dataframe (from step 2) 
# i. In the process check for duplicates and update "is_duplicate" column, if a record with the same:
        # property_id, address, asking price and status, has already been seen


In [None]:
check_keys = ['property_id', 'address', 'Asking price', 'Status']

for ad in ads_list:
    if [ad.get(key) for key in check_keys] in list(ads_df[['property_id', 'address', 'Asking price', 'Status']]):
        print([ad.get(key) for key in check_keys])
        ad['is_duplicate'] = True
        ads_df = ads_df.append(ad, ignore_index=True)
    else:
        ad['is_duplicate'] = False
        ads_df = ads_df.append(ad, ignore_index=True)

In [76]:
ads_df

Unnamed: 0,property_link,property_id,title,address,price,neighbourhood,Transfer of ownership,Asking price,Asking price per m²,Listed since,Status,Acceptance,VVE (Owners Association) contribution,Construction,Type apartment,Building type,Year of construction,Surface areas and volume,Areas,Living area,Exterior space attached to the building,Volume in cubic meters,Layout,Number of rooms,Number of bath rooms,Bathroom facilities,Number of residential layers (stories),Energy,Energy label,Insulation,Heating,Hot water,CH boiler,Cadastral data,Exterior space,Location,Balcony/roof garden,VVE (Owners Association) checklist,Registration with KvK,Annual meeting,Periodic contribution,Reserve fund present,Maintenance plan,Building insurance,Specific,Type of roof,Other space inside the building,Located at,Facilities,Storage space,Shed / storage,Parking,Type of parking facilities,Quality marks,External storage space,Kind of house,Plot size,Garden,Back garden,Garden location,Facilities_Storage space,Garage,Type of garage,Capacity,Facilities_Garage,Construction period,Insulation_Garage,Accessibility,Original asking price,Service charges,Sun terrace,Provisional energy label,Front garden,Patio/atrium,Insulation_Storage space,Rental price,Deposit,Rental fees,Rental agreement,Ownership situation,Purchase combination,Side garden,Location_Exterior space,Commercial property,Office space,Consulting rooms,Auction,Price,Auction date,Auction period,Area,First rental price,scraped_date,Number of stories,Type of auction,Auction party,listed_date_present,is_duplicate


In [118]:
# test_df = pd.DataFrame(columns=column_list)

# for ad in ads_list:
#     if ad['property_id'] == 87998780:
#         if ad['property_id'] not in list(test_df['property_id']):
#             test_df = test_df.append(ad, ignore_index=True)

# test_df

In [119]:
# This is to create a pandas dataframe with the column sorting as in the dictionaries
column_list = []

for ad in ads_list:
    for feat_name in list(ad.keys()):
        if feat_name not in column_list:
            column_list.append(feat_name)

# Initiate the dataframe with the desired columns
ads_df = pd.DataFrame(columns=column_list)

for ad in ads_list:
    ads_df = ads_df.append(ad, ignore_index=True)

ads_df.drop(columns=['price'], inplace=True)
ads_df['property_id'] = ads_df['property_id'].apply(int)
ads_df['days_listed'] = ads_df.apply(lambda x: get_listed_days(x['Listed since'], x['scraped_date']), axis=1)
ads_df['address'] = ads_df['title']+', '+ads_df['address']
ads_df['Asking price'] = ads_df['Asking price'].apply(get_int)
ads_df['Asking price per m²'] = ads_df['Asking price per m²'].apply(get_int)
ads_df['VVE (Owners Association) contribution'] = ads_df['VVE (Owners Association) contribution'].apply(get_int)
ads_df['Year of construction'] = ads_df['Year of construction'].apply(get_int)
ads_df['Living area'] = ads_df['Living area'].apply(get_int)
ads_df['Exterior space attached to the building'] = ads_df['Exterior space attached to the building'].apply(get_int)
ads_df['Volume in cubic meters'] = ads_df['Volume in cubic meters'].apply(get_int)
ads_df['Rooms'] = ads_df['Number of rooms'].apply(get_rooms, room_type='room')
ads_df['Bedrooms'] = ads_df['Number of rooms'].apply(get_rooms, room_type='bedroom')
ads_df['Bathrooms'] = ads_df['Number of bath rooms'].apply(get_rooms, room_type='bathroom')
ads_df['Toilets'] = ads_df['Number of bath rooms'].apply(get_rooms, room_type='toilet')
ads_df['Has_Bathtub'] = ads_df['Bathroom facilities'].apply(get_bath_flag)
ads_df['Baths'] = ads_df['Bathroom facilities'].apply(get_facilities, facility_type='bath')
ads_df['Number of Toilets'] = ads_df['Bathroom facilities'].apply(get_facilities, facility_type='toilet')
ads_df['Showers'] = ads_df['Bathroom facilities'].apply(get_facilities, facility_type='shower')
ads_df['Energy label'] = ads_df['Energy label'].apply(get_energy_label)
ads_df['Provisional energy label'] = ads_df['Provisional energy label'].apply(get_energy_label)


ads_df.rename(columns={'Asking price': 'Asking price (€)', 'Asking price per m²': 'Asking price per m² (€)', 
                       'VVE (Owners Association) contribution': 'VVE contribution (monthly) (€)',
                      'Living area': 'Living area (m²)', 'Volume in cubic meters': 'Volume (m³)'}, inplace=True)
# ads_df

TypeError: ("unsupported operand type(s) for -: 'float' and 'relativedelta'", 'occurred at index 0')

In [None]:
ads_df.head(5)

In [5]:
conn = sqlite3.connect('./Database/ams_market_watch.db')  # You can create a new database by changing the name within the quotes
# cursor = conn.cursor() # The database will be saved in the location where your 'py' file is saved

In [8]:
# While creating the initial database, need to replace existing table to load all the records.
# Once that is set up, will use the append new records statement instead
# ads_df.to_sql('funda_ads', conn, if_exists='replace', index=False)


# For inserting only rows that do not exist in the table
new_ads_df = pd.DataFrame()
old_db_df = pd.read_sql_query("SELECT * FROM funda_ads", con=conn)

for i, row in ads_df.iterrows():
    if row['property_id'] not in old_db_df['property_id']:
        print(row['property_id'])
        current_row_df = pd.DataFrame([row])
        new_ads_df = pd.concat([current_row_df, new_ads_df])

len(new_ads_df)

NameError: name 'ads_df' is not defined

In [67]:
len(new_ads_df), len(old_)

7349

In [6]:
current_db_df = pd.read_sql_query("SELECT * FROM funda_ads", con=conn)
current_db_df[current_db_df['property_id'] == 87998780]

Unnamed: 0,property_link,property_id,title,address,neighbourhood,Transfer of ownership,Asking price (€),Asking price per m² (€),Listed since,Status,Acceptance,VVE contribution (monthly) (€),Construction,Type apartment,Building type,Year of construction,Surface areas and volume,Areas,Living area (m²),Exterior space attached to the building,Volume (m³),Layout,Number of rooms,Number of bath rooms,Bathroom facilities,Number of residential layers (stories),Energy,Energy label,Insulation,Heating,Hot water,CH boiler,Cadastral data,Exterior space,Location,Balcony/roof garden,VVE (Owners Association) checklist,Registration with KvK,Annual meeting,Periodic contribution,Reserve fund present,Maintenance plan,Building insurance,Specific,Type of roof,Other space inside the building,Located at,Facilities,Storage space,Shed / storage,Parking,Type of parking facilities,Quality marks,External storage space,Kind of house,Plot size,Garden,Back garden,Garden location,Facilities_Storage space,Garage,Type of garage,Capacity,Facilities_Garage,Construction period,Insulation_Garage,Accessibility,Original asking price,Service charges,Sun terrace,Provisional energy label,Front garden,Patio/atrium,Insulation_Storage space,Rental price,Deposit,Rental fees,Rental agreement,Ownership situation,Purchase combination,Side garden,Location_Exterior space,Commercial property,Office space,Consulting rooms,Auction,Price,Auction date,Auction period,Area,First rental price,Rooms,Bedrooms,Bathrooms,Toilets,Bathtub,Baths,Number of Toilets,Showers
29,https://www.funda.nl/en/koop/amsterdam/apparte...,87998780,Valkhof 125,"Valkhof 125, 1082 VE Amsterdam","Buitenveldert-West, Amsterdam",,349500,5216,Today,Available,Available in consultation,124,,Upstairs apartment (apartment),Resale property,1963,,,67,7,212,,3 rooms (2 bedrooms),1 bathroom and 1 separate toilet,,1 residential layer (story),,D,Double glazing,CH boiler,CH boiler,HR ketel (gas-fired combination boiler from 20...,AMSTERDAM AK 4363; Ownership situation; Long-t...,,"Alongside a quiet road, in residential distric...",,,Yes,Yes,Yes (€ 124 per month),Yes,Yes,Yes,,,,2nd level of residential structure,TV via cable,,Built-in,,Paid parking and parking permits,,6 m²,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,2.0,1.0,1.0,,,,
3644,https://www.funda.nl/en/koop/amsterdam/apparte...,87998780,Valkhof 125,"Valkhof 125, 1082 VE Amsterdam","Buitenveldert-West, Amsterdam",,349500,5216,"December 7, 2019",Available,Available in consultation,124,,Upstairs apartment (apartment),Resale property,1963,,,67,7,212,,3 rooms (2 bedrooms),1 bathroom and 1 separate toilet,,1 residential layer (story),,D,Double glazing,CH boiler,CH boiler,HR ketel (gas-fired combination boiler from 20...,AMSTERDAM AK 4363; Ownership situation; Long-t...,,"Alongside a quiet road, in residential distric...",,,Yes,Yes,Yes (€ 124 per month),Yes,Yes,Yes,,,,2nd level of residential structure,TV via cable,,Built-in,,Paid parking and parking permits,,6 m²,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,2.0,1.0,1.0,,,,


In [9]:
new_ads_df[new_ads_df['property_id'] == 87998780]

KeyError: 'property_id'

### Let's plot some stuff! :)

In [None]:
sns.scatterplot(current_db_df['Bedrooms'].dropna(), current_db_df['Asking price per m² (€)'].dropna())

<matplotlib.axes._subplots.AxesSubplot at 0x119c93c50>