In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
#Essentials
import numpy as np
import pandas as pd
import pickle
import re
import datetime as dt
from dateutil.relativedelta import *
import time
import seaborn as sns

#SQL related - NEED TO DECIDE WHICH ONE I'LL BE USING AND DELETE THE REST
import sqlite3
import pandas.io.sql as pd_sql
# import psycopg2
# from sqlalchemy import create_engine

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 6000)

import warnings
warnings.filterwarnings('ignore')

In [3]:
def get_energy_label(value):
    """Takes 'Energy label' column and strips it from the words 'What does this mean?'
    Return just the label"""
    no_touch_list = ['Not required', 'Not available', np.NaN]
    if value not in no_touch_list:
        return value[0]
    else: 
        return np.NaN

def get_int(value):
    """Trims the price, area and other fields with numbers and converts them into int"""
    try:
        return re.sub('[€\sk.,m²m³v.o.n.permonthBeforeAfter]', '', value)
    except:
        return np.NaN

def string_to_date(value):
    """Whenever possible, converts text date value to date format (takes "Listen Since" field value as an argument)
    Returns the date value and a boolean True or False to identify whether it successfully converted the date or not."""
    try:
        listed_date = dt.datetime.strptime(value, '%B %d, %Y')
        listed_date = listed_date.date()
        return listed_date, True
    except:
        return np.NaN, False
    
    
def get_listed_date(value, scraped_date):
    """Converts a string in the 'Listed since' column into a listed_date value (in date format)"""
    today = dt.date.today()

    try:
        listed_date = dt.datetime.strptime(value, '%B %d, %Y').date()
#         print('Listed date converted to date format.')
    except:
        if scraped_date is np.NaN:
#             print('Scraped date is NaN.')
            listed_date = np.NaN    
        else:         
            scraped_date = scraped_date        
#             print('Scraped date is accepted.')
            if 'Today' in value:
#                 print('Today interpreted correctly.')
                listed_date = scraped_date
            elif 'week' in value:
#                 print('Weeks interpreted correctly.')
                weeks_listed = int(re.search('\d*', value).group())
                listed_date = scraped_date - relativedelta(weeks=weeks_listed)
            elif 'month' in value:
#                 print('Months interpreted correctly.')
                months_listed = int(re.search('\d*', value).group())
                listed_date = scraped_date - relativedelta(months=months_listed)
            elif '6+' in value:
#                 print('Cannot derive the date listed from "Listed since" field.')
                # Not precise enough to tell, could be 7 months, could be 2 years
                listed_date = np.NaN
            else:
#                 print('This option is not captured in get_listed_days function.')
                listed_date = np.NaN
    return listed_date
    
    
def get_clean_df(df, multiples_df, all_ids_with_listed_date):
    """Takes current dataframe (df), another dataframe that only has the records that have multiple entries and a list of all ids that have
    at least one record where listed_date is present. 
        1. It starts with creating an empty list for already handled cases
        2. Checks if the id was already handled, if so, assigned a False value to the "is_to_keep" column, otherwise, proceeds
        3. Checks if a records is in the multiples_df:
            i. If no, assigned True to "is_to_keep" and ads it to the handled_list
            ii. If yes, it proceeds
        4. Does the remaining checks and assigned True / False where appropriate
        5. Eventually, checks if that id has any record with a listed_date
            i. If no, assigned True to "is_to_keep" and ads it to the handled_list
            ii. If yes, proceeds, because that means a value with True will eventually get assigned on that id
    Returns True / False to suggest which column should be kept (True) / which columns should be dropped."""
    handled_list=[]
    updated_df = df.copy()
    updated_df['is_to_keep'] = False
    
    for index, row in updated_df.iterrows():
        if row['property_id'] not in handled_list:
            
            # There is an issue with records that are duplicates, but also have a change in price or status (e.g. id = 41420958). 
            # Using below process I eliminate them because of the search and multiples_df only by property_id 
            # and because handled_list only have property_ids logged
            
            if row['property_id'] in multiples_df['property_id'].tolist():
                if row['rank_same_record'] == 1 and row['listed_date_present'] == True:
                    handled_list.append(row['property_id'])
                    updated_df.at[index, 'is_to_keep'] = True
                elif row['listed_date_present'] == True:
                    handled_list.append(row['property_id'])
                    updated_df.at[index, 'is_to_keep'] = True
                elif row['property_id'] not in all_ids_with_listed_date and row['rank_same_record'] == 1:
                    handled_list.append(row['property_id'])
                    updated_df.at[index, 'is_to_keep'] = True
                else:
                    updated_df.at[index, 'is_to_keep'] = False
            else:
                handled_list.append(row['property_id'])
                updated_df.at[index, 'is_to_keep'] = True
        else:
            updated_df.iloc[index]['is_to_keep'] = False

    final_clean_df = updated_df[updated_df['is_to_keep'] == True]
#     final_clean_df.drop(columns=['listed_date_present', 'is_duplicate', 'rank_same_record', 'is_to_keep'], inplace=True)
    
    return final_clean_df

def get_rooms(value, room_type):
    """Retrieves the number of rooms specified by the type (room, bedroom, toilet, bathroom, etc.).
    If bedrooms are not specified and there is only 1 room - returns 0, if there is more than 1 room, but bedrooms not specified - returns NaN.
    Otherwise, returns the number of bedrooms"""
    try:
        value = value.lower()
    except:
        return np.NaN
    
    if room_type == 'room':
        try:
            return int(re.search(f'\d* {room_type}', value).group().strip(f' {room_type}'))
        except:
            return np.NaN
    
    if room_type == 'bedroom':
        try:
            if room_type not in value and int(re.search(f'\d* {room_type}', value).group().strip(f' {room_type}')) == 1:
                return 0
            elif room_type not in value:
                return np.NaN
            else:
                return int(re.search(f'\d* {room_type}', value).group().strip(f' {room_type}'))
        except:
            return np.NaN
    
    if room_type == 'toilet':
        try:
            if room_type not in value and int(re.search(f'\d* {room_type}', value).group().strip(f' {room_type}')) == 1:
                return 1
            elif room_type not in value:
                return np.NaN
            else:
                return int(re.search(f'\d*\s[a-z]*\s?{room_type}', value).group().strip(f' separate {room_type}'))
        except:
            return np.NaN
        
    if room_type == 'bathroom':
        try:
            if room_type not in value and int(re.search(f'\d*\s[a-z]*\s?toilet', value).group().strip(f' separate {room_type}')) == 1:
                return 1
            elif room_type not in value:
                return np.NaN
            else:
                return int(re.search(f'\d* {room_type}', value).group().strip(f' {room_type}'))
        except:
            return np.NaN
        
    
def get_bath_flag(value):
    """Takes Bathroom facilities column and create a Bath_Flag column if a bathtub / bath is available in the property.
    Returns True or False"""
    try:
        if 'bath' in value.lower():
            return True
        else:
            return False
    except:
        return np.NaN
    
    
def get_facilities(value, facility_type):
    """Take the Bathroom facilities column and facility type (toilet, shower, bath, jacuzzi, steam cabin, etc.)
    and returns the number of specified facilities"""
    try:
        value = value.lower()
        facility_type = facility_type.lower()
    except:
        return np.NaN
    
    try:
        return int(re.search(f'\d* {facility_type}', value).group().strip(f' {facility_type}'))
    except:
        if facility_type in value:
            return 1
        else:
            return 0

In [4]:
# Load the list of property advertisment dictionaries from a pickle file
with open('./Cellar/Archive/ads_so_far_20191215.pkl', 'rb') as ads_list_pickle:
            dec_19_ads_list = pickle.load(ads_list_pickle)

print(f'Number of records in 2019 December 15th ads list is: {len(dec_19_ads_list)}.')

with open('./Cellar/Archive/ads_so_far_202029.pkl', 'rb') as ads_list_pickle:
            feb_20_ads_list = pickle.load(ads_list_pickle)

print(f'Number of records in 2020 February 9th ads list is: {len(feb_20_ads_list)}.')

with open('./Cellar/latest_ads_dataset.pkl', 'rb') as ads_list_pickle:
            ads_list = pickle.load(ads_list_pickle)

print(f'Number of records in the latest ads list is: {len(ads_list)}.')

with open('./Cellar/Archive/new_adverts_2020414.pkl', 'rb') as new_ads_pickle:
            new_ads = pickle.load(new_ads_pickle)

print(f'Number of records in the new adverts ads list is: {len(new_ads)}.')

Number of records in 2019 December 15th ads list is: 3860.
Number of records in 2020 February 9th ads list is: 5398.
Number of records in the latest ads list is: 8313.
Number of records in the new adverts ads list is: 1365.


### Dictionaries to Pandas without duplicates:

1. Create a small dictionary with just property_id, title, "Listed since" and "Asking price" for both 2019 Dec and 2020 Feb lists  
    i. This will allow to get the scraped dates and increase the number of records with appropriate **listed_date** (*this is a one-off process and will not have to be repeated going forward*). 
2. Get all the columns from dictionaries
3. Add Add "Listed_date_present", "listed_date" and "is_duplicate" columns
4. Create a pandas dataframe with the column list retrieved in step 1
5. Iterate over the ads_list and add a listed_date field (using get_listed_date function), where possible  
    i. In this step, as well add another key-value pair to identify records with a successful conversion of "Listed since" value to date format  
    ii. Since we are iterating already, just add the "is_duplicate" column as well
6. Now, iterate over the ads_list and append records to the empty dataframe (from step 2)  
    i. Create a check_ads_list with only columns used to check for duplicates   
    ii. In the process check for duplicates and update "is_duplicate" value, if a record with the same id, title, asking price and status, has already been seen
7. As a final step, filter the dataframe to only keep **unique** records **with** a listed date (where possible) 
8. Now, we are ready to convert columns to the right formats, clean up text values and make them numbers, etc.



* Additional challenge: if there are duplicate records and one of them has no scraped_date and has "Listed since" = Today, while the other(s) has an actual listed_date, we can update scraped_date field of the record with no scraped date

In [5]:
# Step 1 - eate a small dictionary with just property_id, title, "Listed since" and "Asking price" for both 2019 Dec and 2020 Feb lists
scrape_date_check = ['property_id', 'title', 'Listed since', 'Asking price']
my_order = [0, 1, 3, 2]

dec19_check_list = []

# Iterate over the 2019 Dec ads list
for ad in dec_19_ads_list:
    # Create a new_ad dictionary for each ad
    new_ad = []
    # Iterate over the keys in each ad and only append those that appear in the scrape_date_check list
    for key in ad.keys():
        if key in scrape_date_check:
            new_ad.append(ad[key])
    if len(new_ad) < 4:
        new_ad_ordered = new_ad.copy()
    else:
        new_ad_ordered = [new_ad[i] for i in my_order]
    dec19_check_list.append(new_ad_ordered)
    
    
    
# Repeat the same for 2020 Feb
feb20_check_list = []            

# Iterate over the 2020 Feb ads list
for ad in feb_20_ads_list:
    # Create a new_ad dictionary for each ad
    new_ad = []
    # Iterate over the keys in each ad and only append those that appear in the scrape_date_check list
    for key in ad.keys():
        if key in scrape_date_check:
            new_ad.append(ad[key])
    if len(new_ad) < 4:
        new_ad_ordered = new_ad.copy()
    else:
        new_ad_ordered = [new_ad[i] for i in my_order]
    feb20_check_list.append(new_ad_ordered)



# i. Assign a scraped date where possible
for ad in ads_list:
    if [ad.get(key) for key in scrape_date_check] in dec19_check_list:
        ad['scraped_date'] = dt.date(2019, 12, 15)
    elif [ad.get(key) for key in scrape_date_check] in feb20_check_list:
        ad['scraped_date'] = dt.date(2020, 2, 9)
    else:
        pass

In [6]:
# Step 2 - This is to create a pandas dataframe with the column sorting as in the dictionaries
column_list = []

for ad in ads_list:
    for feat_name in list(ad.keys()):
        if feat_name not in column_list:
            column_list.append(feat_name)      
            

# Step 3 - Add "Listed_date_present", "listed_date" and "is_duplicate" columns
column_list.append('listed_date')
column_list.append('listed_date_present')
column_list.append('is_duplicate')
column_list.append('rank_same_record')


# Step 4 - Initiate the dataframe with the desired columns
ads_df = pd.DataFrame(columns=column_list)


# Step 5 - Iterate over the ads_list and replace "Listed since" field with a date (using get_listed_date function), where possible 
    #i. #As part of this step, add another key-value pair to identify records with a successful conversion of "Listed since" to date
    #ii. Since we are iterating already, just add the "is_duplicate" column as well
for ad in ads_list:
    ad['listed_date'], ad['listed_date_present'] = string_to_date(ad['Listed since'])
    ad['is_duplicate'] = False
    
    
# Step 6 - iterate over the ads_list and append unique records to the empty dataframe (from step 2) 
    # i. Create a check_ads_list with only columns used to check for duplicates
    # ii. In the process check for duplicates and update "is_duplicate" value, if a record with the same:
        # property_id, title, asking price and status, has already been seen
check_keys = ['property_id', 'title', 'Asking price', 'Status']
ads_seen_list = []

for ad in ads_list:
    # Keeps track of all the ads seen so far
    ad_check_value = [ad.get(key) for key in check_keys]
    ads_seen_list.append(ad_check_value)
    
    # Assigned a rank_same_record value (if 0 then this is the first record)
    ad['rank_same_record'] = ads_seen_list.count(ad_check_value)
    
    # Checks if the ad is in the ads_df already and updates is_duplicate and rank_same_record columns
    if ad_check_value in ads_df[['property_id', 'title', 'Asking price', 'Status']].values.tolist():
        ad['is_duplicate'] = True
        ads_df = ads_df.append(ad, ignore_index=True)
    else:
        ad['is_duplicate'] = False
        ads_df = ads_df.append(ad, ignore_index=True)
        
print(f'Original ads dataframe had {len(ads_df)} non-unique records.')
        
        
# Step 7 - filter the dataframe to only keep unique records with a listed date (where possible)
# Define a dataframe with all the records that show up in the dataset multiple times
multiples_df = ads_df.groupby(['property_id', 'title', 'Asking price', 'Status'])['property_link'].count().reset_index()
multiples_df.rename(columns={'property_link': 'times_in_df'}, inplace=True)
multiples_df = multiples_df[multiples_df['times_in_df'] > 1]
print(f'Multiples df had the total of {len(multiples_df)} unique property ids.')

# Create a dataframe with all the records that have a listed_date 
all_ids_with_listed_date = ads_df[(ads_df['listed_date_present'] == True)].groupby('property_id')['listed_date_present'].nunique().reset_index()['property_id'].tolist()
print(f'All ads with listed date list had the total of {len(all_ids_with_listed_date)} unique property ids.')

clean_ads_df = get_clean_df(ads_df, multiples_df, all_ids_with_listed_date)
print(f'Clean ads dataframe has {len(clean_ads_df)} unique records. Total number of records removed is {len(ads_df) - len(clean_ads_df)}.')

Original ads dataframe had 8313 non-unique records.
Multiples df had the total of 1164 unique property ids.
All ads with listed date list had the total of 1501 unique property ids.
Clean ads dataframe has 6191 unique records. Total number of records removed is 2122.


In [31]:
len(ads_df[ads_df.duplicated(subset=['property_id', 'address', 'Asking price', 'Status'], keep=False) == True])

2648

In [46]:
# This little step is to find all the records that were not duplicates when price was included, but became such, 
# when price was removed from the list of columns. The same can be done for Status (by removing Status from the "3field_df" list of columns)

duplicates_4field_df = ads_df[ads_df.duplicated(subset=['property_id', 'address', 'Asking price', 'Status'], keep=False) == True]
duplicates_3field_df = ads_df[ads_df.duplicated(subset=['property_id', 'address', 'Status'], keep=False) == True]

for i, row in duplicates_3field_df.iterrows():
    if row.values.tolist() not in duplicates_4field_df.values.tolist():
        print(row['property_id'])

41420958
87981094
41537182
40835885
41447928
87886094
87886068
40800866
40800865
86645977
86565599
40927939
40322260
40622618
40077067
40077066
41466396
41578665
87919760
87819191
41527387
41556281
41512424
41565611
41445898
40310093
87990160
41476858
41534916
87991079
87933045
40336586
40336585
87882512
41400085
87862792
87995361
40077088
40077072
40077071
86211822
40341605
87983016
41547405
41547405
41688119
41534916
41537182
41447928
87091094
41621391
41608091
87800752
41687848
87090290
40322260
87093955
41466396
87037550
87051381
41527387
41556281
41681730
41687818
41512424
41565611
41698748
87990160
87991079
41521343
41688119
41546922
41559069
87091094
41621391
41439093
41577751
41608091
41535771
87800752
41687848
87090290
87093955
41576510
41424206
41420075
87051381
41558885
40927939
41698748
41401579
41681730
41687818
41434947
86648748
87919760
87926070
87882512
41400085
87862792
40077088
40077072
40077071
86211822
40341605
87983016


In [45]:
clean_ads_df[clean_ads_df['property_id'] == 41420958]       #40051099, 87804888, 87804860, 87998136, 87998780

Unnamed: 0,property_link,property_id,title,address,price,neighbourhood,Transfer of ownership,Asking price,Asking price per m²,Listed since,Status,Acceptance,VVE (Owners Association) contribution,Construction,Type apartment,Building type,Year of construction,Surface areas and volume,Areas,Living area,Exterior space attached to the building,Volume in cubic meters,Layout,Number of rooms,Number of bath rooms,Bathroom facilities,Number of residential layers (stories),Energy,Energy label,Insulation,Heating,Hot water,CH boiler,Cadastral data,Exterior space,Location,Balcony/roof garden,VVE (Owners Association) checklist,Registration with KvK,Annual meeting,Periodic contribution,Reserve fund present,Maintenance plan,Building insurance,scraped_date,Specific,Type of roof,Other space inside the building,Located at,Facilities,...,Shed / storage,Parking,Type of parking facilities,Quality marks,External storage space,Kind of house,Plot size,Garden,Back garden,Garden location,Facilities_Storage space,Garage,Type of garage,Capacity,Facilities_Garage,Construction period,Insulation_Garage,Accessibility,Original asking price,Service charges,Sun terrace,Provisional energy label,Front garden,Patio/atrium,Insulation_Storage space,Rental price,Deposit,Rental fees,Rental agreement,Ownership situation,Purchase combination,Side garden,Location_Exterior space,Commercial property,Office space,Consulting rooms,Auction,Price,Auction date,Auction period,Area,First rental price,Number of stories,Type of auction,Auction party,listed_date,listed_date_present,is_duplicate,rank_same_record,is_to_keep
113,https://www.funda.nl/en/koop/amsterdam/huis-41...,41420958,Amstelveenseweg 1052,1081 JV Amsterdam,"€ 1,495,000 k.k.","Buitenveldert-West, Amsterdam",,"€ 1,495,000 k.k.","€ 6,028",8 weeks,Available,Available in consultation,,,,Resale property,1939,,,248 m²,22 m²,795 m³,,8 rooms (7 bedrooms),2 bathrooms and 1 separate toilet,"Jacuzzi, 2 showers and 2 toilets",3 residential layers (stories) and a basement,,,Roof insulation and double glazing,CH boiler,CH boiler,"2010, in ownership",AMSTERDAM AK 743; Area; 214 m²; Ownership situ...,,"On the edge of a forest, alongside busy road a...",Balcony present,,,,,,,,2019-12-15,,Combination roof covered with roof tiles and a...,,,"Mechanical ventilation, TV via cable and glass...",...,Built-in,,Paid parking and parking permits,,,"Desirable residence/villa, row house",214 m²,Back garden,126 m² (21m deep and 6m broad),Located at the west,Electricity,,,,,,,,"€ 1,595,000 k.k.",,,F What does this mean?,,,No insulation,,,,,,,,,,,,,,,,,,,,,,False,False,1,True


In [155]:
# Duplicate values: 87804888, 87804860, 87998136

In [15]:
# test_df = ads_df[ads_df['property_id'] == 40051099]

# test_df['listed_date'] = test_df.apply(lambda x: get_listed_date(x['Listed since'], x['scraped_date']), axis=1)
# test_df

In [38]:
# Step 8 - Now, we are ready to convert columns to the right formats, clean up text values and make them numbers, etc.

clean_ads_df.drop(columns=['price'], inplace=True)
clean_ads_df['property_id'] = clean_ads_df['property_id'].apply(int)
clean_ads_df['listed_date'] = clean_ads_df.apply(lambda x: get_listed_date(x['Listed since'], x['scraped_date']), axis=1)
clean_ads_df['address'] = clean_ads_df['title']+', '+clean_ads_df['address']
clean_ads_df['Asking price'] = clean_ads_df['Asking price'].apply(get_int)
clean_ads_df['Asking price per m²'] = clean_ads_df['Asking price per m²'].apply(get_int)
clean_ads_df['VVE (Owners Association) contribution'] = clean_ads_df['VVE (Owners Association) contribution'].apply(get_int)
clean_ads_df['Year of construction'] = clean_ads_df['Year of construction'].apply(get_int)
clean_ads_df['Living area'] = clean_ads_df['Living area'].apply(get_int)
clean_ads_df['Exterior space attached to the building'] = clean_ads_df['Exterior space attached to the building'].apply(get_int)
clean_ads_df['Volume in cubic meters'] = clean_ads_df['Volume in cubic meters'].apply(get_int)
clean_ads_df['Rooms'] = clean_ads_df['Number of rooms'].apply(get_rooms, room_type='room')
clean_ads_df['Bedrooms'] = clean_ads_df['Number of rooms'].apply(get_rooms, room_type='bedroom')
clean_ads_df['Bathrooms'] = clean_ads_df['Number of bath rooms'].apply(get_rooms, room_type='bathroom')
clean_ads_df['Toilets'] = clean_ads_df['Number of bath rooms'].apply(get_rooms, room_type='toilet')
clean_ads_df['Has_Bathtub'] = clean_ads_df['Bathroom facilities'].apply(get_bath_flag)
clean_ads_df['Baths'] = clean_ads_df['Bathroom facilities'].apply(get_facilities, facility_type='bath')
clean_ads_df['Number of Toilets'] = clean_ads_df['Bathroom facilities'].apply(get_facilities, facility_type='toilet')
clean_ads_df['Showers'] = clean_ads_df['Bathroom facilities'].apply(get_facilities, facility_type='shower')
clean_ads_df['Energy label'] = clean_ads_df['Energy label'].apply(get_energy_label)
clean_ads_df['Provisional energy label'] = clean_ads_df['Provisional energy label'].apply(get_energy_label)


clean_ads_df.rename(columns={'Asking price': 'Asking price (€)', 'Asking price per m²': 'Asking price per m² (€)', 
                       'VVE (Owners Association) contribution': 'VVE contribution (monthly) (€)',
                      'Living area': 'Living area (m²)', 'Volume in cubic meters': 'Volume (m³)'}, inplace=True)

In [None]:
ads_df.head(5)

In [5]:
conn = sqlite3.connect('./Database/ams_market_watch.db')  # You can create a new database by changing the name within the quotes
# cursor = conn.cursor() # The database will be saved in the location where your 'py' file is saved

In [8]:
# While creating the initial database, need to replace existing table to load all the records.
# Once that is set up, will use the append new records statement instead
# ads_df.to_sql('funda_ads', conn, if_exists='replace', index=False)


# For inserting only rows that do not exist in the table
new_ads_df = pd.DataFrame()
old_db_df = pd.read_sql_query("SELECT * FROM funda_ads", con=conn)

for i, row in ads_df.iterrows():
    if row['property_id'] not in old_db_df['property_id']:
        print(row['property_id'])
        current_row_df = pd.DataFrame([row])
        new_ads_df = pd.concat([current_row_df, new_ads_df])

len(new_ads_df)

NameError: name 'ads_df' is not defined

In [67]:
len(new_ads_df), len(old_)

7349

In [6]:
current_db_df = pd.read_sql_query("SELECT * FROM funda_ads", con=conn)
current_db_df[current_db_df['property_id'] == 87998780]

Unnamed: 0,property_link,property_id,title,address,neighbourhood,Transfer of ownership,Asking price (€),Asking price per m² (€),Listed since,Status,Acceptance,VVE contribution (monthly) (€),Construction,Type apartment,Building type,Year of construction,Surface areas and volume,Areas,Living area (m²),Exterior space attached to the building,Volume (m³),Layout,Number of rooms,Number of bath rooms,Bathroom facilities,Number of residential layers (stories),Energy,Energy label,Insulation,Heating,Hot water,CH boiler,Cadastral data,Exterior space,Location,Balcony/roof garden,VVE (Owners Association) checklist,Registration with KvK,Annual meeting,Periodic contribution,Reserve fund present,Maintenance plan,Building insurance,Specific,Type of roof,Other space inside the building,Located at,Facilities,Storage space,Shed / storage,Parking,Type of parking facilities,Quality marks,External storage space,Kind of house,Plot size,Garden,Back garden,Garden location,Facilities_Storage space,Garage,Type of garage,Capacity,Facilities_Garage,Construction period,Insulation_Garage,Accessibility,Original asking price,Service charges,Sun terrace,Provisional energy label,Front garden,Patio/atrium,Insulation_Storage space,Rental price,Deposit,Rental fees,Rental agreement,Ownership situation,Purchase combination,Side garden,Location_Exterior space,Commercial property,Office space,Consulting rooms,Auction,Price,Auction date,Auction period,Area,First rental price,Rooms,Bedrooms,Bathrooms,Toilets,Bathtub,Baths,Number of Toilets,Showers
29,https://www.funda.nl/en/koop/amsterdam/apparte...,87998780,Valkhof 125,"Valkhof 125, 1082 VE Amsterdam","Buitenveldert-West, Amsterdam",,349500,5216,Today,Available,Available in consultation,124,,Upstairs apartment (apartment),Resale property,1963,,,67,7,212,,3 rooms (2 bedrooms),1 bathroom and 1 separate toilet,,1 residential layer (story),,D,Double glazing,CH boiler,CH boiler,HR ketel (gas-fired combination boiler from 20...,AMSTERDAM AK 4363; Ownership situation; Long-t...,,"Alongside a quiet road, in residential distric...",,,Yes,Yes,Yes (€ 124 per month),Yes,Yes,Yes,,,,2nd level of residential structure,TV via cable,,Built-in,,Paid parking and parking permits,,6 m²,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,2.0,1.0,1.0,,,,
3644,https://www.funda.nl/en/koop/amsterdam/apparte...,87998780,Valkhof 125,"Valkhof 125, 1082 VE Amsterdam","Buitenveldert-West, Amsterdam",,349500,5216,"December 7, 2019",Available,Available in consultation,124,,Upstairs apartment (apartment),Resale property,1963,,,67,7,212,,3 rooms (2 bedrooms),1 bathroom and 1 separate toilet,,1 residential layer (story),,D,Double glazing,CH boiler,CH boiler,HR ketel (gas-fired combination boiler from 20...,AMSTERDAM AK 4363; Ownership situation; Long-t...,,"Alongside a quiet road, in residential distric...",,,Yes,Yes,Yes (€ 124 per month),Yes,Yes,Yes,,,,2nd level of residential structure,TV via cable,,Built-in,,Paid parking and parking permits,,6 m²,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,2.0,1.0,1.0,,,,


In [9]:
new_ads_df[new_ads_df['property_id'] == 87998780]

KeyError: 'property_id'

### Let's plot some stuff! :)

In [None]:
sns.scatterplot(current_db_df['Bedrooms'].dropna(), current_db_df['Asking price per m² (€)'].dropna())

<matplotlib.axes._subplots.AxesSubplot at 0x119c93c50>