In [1]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Stretch the notebook's `.container` element to 95% of the browser width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
#Essentials
import numpy as np
import pandas as pd
import pickle
import re
import datetime as dt
from dateutil.relativedelta import *
import time

#SQL related - NEED TO DECIDE WHICH ONE I'LL BE USING AND DELETE THE REST
import sqlite3
import pandas.io.sql as pd_sql
# import psycopg2
# from sqlalchemy import create_engine

# Notebook-wide pandas display limits: render up to 100 columns and 6000 rows
# before pandas truncates the output.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 6000

In [3]:
def get_energy_label(value):
    """Reduce a scraped 'Energy label' cell to the first character of the label.

    The scraped text carries the label followed by helper text such as
    'What does this mean?', so only the leading character is kept.

    Args:
        value: Raw cell content; expected to be a string, may be NaN/None.

    Returns:
        The first character of ``value``, or ``np.nan`` when ``value`` is not a
        string, is empty, or is one of the placeholders 'Not required' /
        'Not available'.
    """
    placeholders = ('Not required', 'Not available')
    # Type check instead of `value in [..., np.nan]`: list membership only
    # recognises NaN by identity, so any other NaN object (or None) would slip
    # through and crash on `value[0]`.
    if not isinstance(value, str) or not value or value in placeholders:
        return np.nan
    # NOTE(review): multi-character labels (e.g. 'A+') are truncated to their
    # first character, as before - confirm that this is intended.
    return value[0]

def get_int(value):
    """Strip currency, unit and filler characters from a scraped numeric field.

    Despite the name, the cleaned text is returned as a *string*
    (e.g. '€ 1,000,000 k.k.' -> '1000000'); no numeric conversion happens here.

    Args:
        value: Raw cell content such as a price, area or volume.

    Returns:
        ``value`` with the unwanted characters removed, or ``np.nan`` when
        ``value`` is not a string (e.g. a missing cell).
    """
    if not isinstance(value, str):
        return np.nan
    # The character class removes each listed character individually (it does
    # not match whole words): '€', whitespace, '.', ',', '²', '³' and the
    # letters that make up 'k', 'm', 'v.o.n.', 'per month', 'Before', 'After'.
    # Raw string so that '\s' is not an invalid string escape.
    return re.sub(r'[€\sk.,m²m³v.o.n.permonthBeforeAfter]', '', value)

def get_listed_date(value, scraped_date):
    """Convert a scraped 'Listed since' text into a listing date.

    Supported inputs:
      * an absolute date in '%B %d, %Y' format (e.g. 'March 3, 2020');
      * 'Today' -> ``scraped_date``;
      * '<n> week(s)' -> ``scraped_date`` minus n weeks;
      * '<n> month(s)' (including '6+ months') -> ``scraped_date`` minus n
        months, which for '6+ months' is only the latest possible date.

    Args:
        value: Raw cell content; may be NaN/None.
        scraped_date: Date(time) on which the advertisement was scraped; the
            reference point for the relative phrasings.

    Returns:
        The listing date, or ``np.nan`` when ``value`` is missing or not in a
        recognised format.
    """
    if not isinstance(value, str):
        return np.nan

    try:
        return dt.datetime.strptime(value, '%B %d, %Y')
    except ValueError:
        pass  # not an absolute date; try the relative phrasings below

    if 'Today' in value:
        return scraped_date

    # r'\d+' rather than r'\d*': the latter happily matches an empty string at
    # position 0 and then fails in int('').
    amount = re.search(r'\d+', value)
    if amount is None:
        return np.nan
    count = int(amount.group())

    if 'week' in value:
        return scraped_date - dt.timedelta(weeks=count)
    if 'month' in value:
        # Calendar-aware month arithmetic; relativedelta is provided by the
        # notebook's dateutil import.
        return scraped_date - relativedelta(months=count)
    return np.nan
    
def _leading_count(pattern, text):
    """Return the integer captured by group 1 of ``pattern`` in ``text``, or NaN if there is no match."""
    match = re.search(pattern, text)
    return int(match.group(1)) if match else np.nan


def get_rooms(value, room_type):
    """Extract a room count from a scraped layout description.

    Args:
        value: Raw cell content, e.g. '6 rooms (4 bedrooms)' or
            '1 bathroom and 1 separate toilet'; may be NaN/None.
        room_type: One of 'room', 'bedroom', 'toilet' or 'bathroom'.

    Returns:
        * 'room': the number in front of 'room(s)'.
        * 'bedroom': the number in front of 'bedroom(s)'; when bedrooms are not
          mentioned, 0 if the property has exactly 1 room, otherwise NaN.
        * 'toilet': the number in front of '[separate] toilet(s)'.
        * 'bathroom': the number in front of 'bathroom(s)'.
        ``np.nan`` whenever the count cannot be determined, ``value`` is not a
        string, or ``room_type`` is not one of the supported types.
    """
    if not isinstance(value, str):
        return np.nan
    text = value.lower()

    if room_type == 'room':
        return _leading_count(r'(\d+) room', text)

    if room_type == 'bedroom':
        if 'bedroom' in text:
            return _leading_count(r'(\d+) bedroom', text)
        # No bedrooms listed: a single-room property has 0 separate bedrooms.
        # The room count is looked up here; searching for 'bedroom' in a text
        # that does not contain it can never succeed.
        return 0 if _leading_count(r'(\d+) room', text) == 1 else np.nan

    if room_type == 'toilet':
        # Allows one optional word (e.g. 'separate') between number and 'toilet'.
        return _leading_count(r'(\d+)\s[a-z]*\s?toilet', text)

    if room_type == 'bathroom':
        return _leading_count(r'(\d+) bathroom', text)

    return np.nan
        
    
def get_bath_flag(value):
    """Tell whether a 'Bathroom facilities' cell mentions a bath.

    The check is a case-insensitive substring test, so any word containing
    'bath' (e.g. 'bathtub') counts as well.

    Args:
        value: Raw cell content; may be NaN/None.

    Returns:
        True or False for string input, ``np.nan`` when ``value`` is not a
        string (e.g. a missing cell).
    """
    if not isinstance(value, str):
        return np.nan
    return 'bath' in value.lower()
    
def get_facilities(value, facility_type):
    """Count how often a facility is listed in a 'Bathroom facilities' cell.

    Args:
        value: Raw cell content, e.g. 'Bath, 2 showers and toilet'; may be
            NaN/None.
        facility_type: Facility to look for (toilet, shower, bath, jacuzzi,
            steam cabin, ...); matched case-insensitively.

    Returns:
        * the number written directly in front of the facility name, if any;
        * 1 when the facility is mentioned without a number;
        * 0 when it is not mentioned at all;
        * ``np.nan`` when either argument is not a string.
    """
    if not isinstance(value, str) or not isinstance(facility_type, str):
        return np.nan
    text = value.lower()
    facility = facility_type.lower()

    # re.escape keeps a facility name containing regex metacharacters literal;
    # the capture group replaces the previous str.strip() trick, which strips a
    # character *set* rather than a suffix.
    counted = re.search(rf'(\d+) {re.escape(facility)}', text)
    if counted:
        return int(counted.group(1))
    return 1 if facility in text else 0

In [4]:
# Read back the pickled list of property advertisement dictionaries.
# NOTE: pickle.load can execute arbitrary code - only open pickle files that
# come from a trusted source.
with open('./Cellar/ads_so_far_2020328.pkl', 'rb') as ads_list_pickle:
    ads_list = pickle.load(ads_list_pickle)

# Number of advertisements loaded
len(ads_list)

5398

In [5]:
# Collect every feature name in first-seen order so that the dataframe columns
# follow the ordering of the advertisement dictionaries (dict keys keep
# insertion order, which makes dict.fromkeys an ordered de-duplication).
column_list = list(dict.fromkeys(feat_name for ad in ads_list for feat_name in ad))

# Build the dataframe in a single call: growing it row by row with
# DataFrame.append is quadratic, and DataFrame.append was removed in pandas 2.0.
# dtype=object keeps the raw scraped values untouched, like appending to an
# empty frame did; keys missing from an advertisement become NaN.
ads_df = pd.DataFrame(ads_list, columns=column_list, dtype=object)

ads_df = ads_df.drop(columns=['price'])
ads_df['property_id'] = ads_df['property_id'].apply(int)
ads_df['address'] = ads_df['title'] + ', ' + ads_df['address']

# Strip currency / unit text from the numeric fields.
numeric_text_columns = [
    'Asking price',
    'Asking price per m²',
    'VVE (Owners Association) contribution',
    'Year of construction',
    'Living area',
    'Exterior space attached to the building',
    'Volume in cubic meters',
]
for column in numeric_text_columns:
    ads_df[column] = ads_df[column].apply(get_int)

# Room counts parsed from the free-text layout columns: (new column, source column, room type).
room_columns = [
    ('Rooms', 'Number of rooms', 'room'),
    ('Bedrooms', 'Number of rooms', 'bedroom'),
    ('Bathrooms', 'Number of bath rooms', 'bathroom'),
    ('Toilets', 'Number of bath rooms', 'toilet'),
]
for new_column, source_column, room_type in room_columns:
    ads_df[new_column] = ads_df[source_column].apply(get_rooms, room_type=room_type)

# Bathroom facilities: a bath flag plus one count column per facility.
ads_df['Bathtub'] = ads_df['Bathroom facilities'].apply(get_bath_flag)
facility_columns = [
    ('Baths', 'bath'),
    ('Number of Toilets', 'toilet'),
    ('Showers', 'shower'),
]
for new_column, facility_type in facility_columns:
    ads_df[new_column] = ads_df['Bathroom facilities'].apply(get_facilities, facility_type=facility_type)

# Keep only the label itself in the energy label columns.
for column in ['Energy label', 'Provisional energy label']:
    ads_df[column] = ads_df[column].apply(get_energy_label)

# Put the units into the column names now that the values no longer carry them.
ads_df = ads_df.rename(columns={
    'Asking price': 'Asking price (€)',
    'Asking price per m²': 'Asking price per m² (€)',
    'VVE (Owners Association) contribution': 'VVE contribution (monthly) (€)',
    'Living area': 'Living area (m²)',
    'Volume in cubic meters': 'Volume (m³)',
})

In [6]:
# Preview the first row of the cleaned dataframe
ads_df.iloc[:1]

Unnamed: 0,property_link,property_id,title,address,neighbourhood,Transfer of ownership,Asking price (€),Asking price per m² (€),Listed since,Status,Acceptance,VVE contribution (monthly) (€),Construction,Type apartment,Building type,Year of construction,Surface areas and volume,Areas,Living area (m²),Exterior space attached to the building,Volume (m³),Layout,Number of rooms,Number of bath rooms,Bathroom facilities,Number of residential layers (stories),Energy,Energy label,Insulation,Heating,Hot water,CH boiler,Cadastral data,Exterior space,Location,Balcony/roof garden,VVE (Owners Association) checklist,Registration with KvK,Annual meeting,Periodic contribution,Reserve fund present,Maintenance plan,Building insurance,Specific,Type of roof,Other space inside the building,Located at,Facilities,Storage space,Shed / storage,Parking,Type of parking facilities,Quality marks,External storage space,Kind of house,Plot size,Garden,Back garden,Garden location,Facilities_Storage space,Garage,Type of garage,Capacity,Facilities_Garage,Construction period,Insulation_Garage,Accessibility,Original asking price,Service charges,Sun terrace,Provisional energy label,Front garden,Patio/atrium,Insulation_Storage space,Rental price,Deposit,Rental fees,Rental agreement,Ownership situation,Purchase combination,Side garden,Location_Exterior space,Commercial property,Office space,Consulting rooms,Auction,Price,Auction date,Auction period,Area,First rental price,Rooms,Bedrooms,Bathrooms,Toilets,Bathtub,Baths,Number of Toilets,Showers
0,https://www.funda.nl/en/koop/amsterdam/apparte...,41580542,Jacob van Lennepkade 2 1/2,"Jacob van Lennepkade 2 1/2, 1053 MJ Amsterdam","Van Lennepbuurt, Amsterdam",,1000000,6452,2 weeks,Available,Available in consultation,200,,Upstairs apartment (double upstairs apartment),Resale property,1906,,,155,12,537,,6 rooms (4 bedrooms),1 bathroom and 1 separate toilet,"Bath, shower and toilet",2 residential layers (stories),,E,Partly double glazed,CH boiler,CH boiler,Combination boiler,AMSTERDAM Q 8745; Ownership situation; Full ow...,,"Alongside water, in residential district and u...",Balcony present,,Yes,Yes,Yes (€ 200 per month),Yes,Yes,Yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.0,4.0,1.0,1.0,True,1.0,1.0,1.0


In [None]:
# Open the SQLite database file at this path (relative to the current working
# directory). sqlite3 creates the file if it does not exist yet, so changing the
# file name within the quotes starts a new database.
conn = sqlite3.connect('./Database/ams_market_watch.db')

In [None]:
# Initial load: overwrite the 'funda_ads' table so that it holds every record.
# Once the database is set up, this is meant to be switched to appending only
# the new records.
ads_df.to_sql(name='funda_ads', con=conn, if_exists='replace', index=False)


insert_query = """"""

  dtype=dtype)


ValueError: Table 'funda_ads' already exists.

In [9]:
# Read the stored advertisements back from the database and preview the first row
current_db_df = pd.read_sql_query(sql="SELECT * FROM funda_ads", con=conn)
current_db_df.head(1)