In [41]:
# Widen the notebook container so that wide dataframes are easier to read.
# `IPython.display` is the public home of `display` and `HTML`; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import HTML, display
display(HTML("<style>.container { width:95% !important; }</style>"))

In [42]:
#Essentials
import numpy as np
import pandas as pd
import pickle
import re
import datetime as dt
from dateutil.relativedelta import *
import time
import seaborn as sns

#SQL related - NEED TO DECIDE WHICH ONE I'LL BE USING AND DELETE THE REST
import sqlite3
import pandas.io.sql as pd_sql
# import psycopg2
# from sqlalchemy import create_engine

# Raise the pandas display limits so wide / long frames are shown without truncation
pd.options.display.max_columns = 100
pd.options.display.max_rows = 6000

import warnings
# Silence all warnings for the rest of the session
warnings.filterwarnings('ignore')

In [43]:
def get_energy_label(value):
    """Extract the energy label from an 'Energy label' cell.

    The scraped text looks like 'G What does this mean?'; only its first
    character is returned, so a multi-character label such as 'A+' is
    reduced to 'A'.

    Parameters:
        value: the raw cell value (normally a string, NaN when missing).

    Returns:
        The first character of the text, or NaN when the value is missing,
        empty, not a string, or one of 'Not required' / 'Not available'.
    """
    # Anything that is not non-empty text (NaN, None, numbers, '') carries no label
    if not isinstance(value, str) or not value:
        return np.nan
    if value in ('Not required', 'Not available'):
        return np.nan
    return value[0]

def get_int(value):
    """Strip currency, unit and qualifier characters from a scraped numeric field.

    Removes the euro sign, whitespace, '.', ',', '²', '³' and every letter that
    occurs in 'k', 'm', 'v.o.n.', 'per month', 'Before' and 'After', e.g.
    '€ 309,500 k.k.' -> '309500' and '49 m²' -> '49'.

    Parameters:
        value: the raw cell value (normally a string, NaN when missing).

    Returns:
        The cleaned text. Note that, despite the function name, the result is a
        string and is not converted to int. NaN is returned when `value` is not
        a string.
    """
    try:
        # Raw string so that '\s' reaches the regex engine as a whitespace class
        return re.sub(r'[€\sk.,m²m³v.o.n.permonthBeforeAfter]', '', value)
    except TypeError:
        # re.sub rejects non-string input (NaN, None, numbers)
        return np.nan

def string_to_date(value):
    """Convert a 'Listed since' text such as 'January 31, 2020' to a date, when possible.

    Parameters:
        value: the raw 'Listed since' value.

    Returns:
        A tuple (listed_date, converted): the parsed `datetime.date` and True when
        the text matches the '%B %d, %Y' format, otherwise (NaN, False).
    """
    try:
        listed_date = dt.datetime.strptime(value, '%B %d, %Y').date()
    except (TypeError, ValueError):
        # TypeError: value is not a string; ValueError: text is not in the expected format
        return np.nan, False
    return listed_date, True
    
    
def get_listed_date(value, scraped_date):
    """Convert a 'Listed since' text into a listed date.

    Parameters:
        value: the raw 'Listed since' text. Handled forms are an absolute date
            ('January 31, 2020'), 'Today', 'N weeks', 'N months' and '6+ months'.
        scraped_date: the date on which the advert was scraped; required to
            resolve the relative forms.

    Returns:
        A date, or NaN when the listed date cannot be determined: the value is
        not text, the scraped date is missing for a relative form, the form is
        '6+ ...' (too imprecise), or the form is not recognised.
    """
    try:
        return dt.datetime.strptime(value, '%B %d, %Y').date()
    except (TypeError, ValueError):
        pass  # not an absolute date; try the relative forms below

    # Relative forms need both a text value and a known scraped date
    if not isinstance(value, str) or pd.isna(scraped_date):
        return np.nan

    if 'Today' in value:
        return scraped_date

    # Must be tested before 'month': '6+ months' could be 7 months or 2 years,
    # which is not precise enough to derive a date.
    if '6+' in value:
        return np.nan

    count_match = re.search(r'\d+', value)
    if count_match is None:
        return np.nan
    count = int(count_match.group())

    if 'week' in value:
        return scraped_date - relativedelta(weeks=count)
    if 'month' in value:
        return scraped_date - relativedelta(months=count)
    return np.nan
    
    
def get_clean_df(df, all_ids_with_listed_date):
    """Return a copy of `df` without redundant duplicate records.

    Parameters:
        df: dataframe with at least the columns 'property_id', 'is_price_change',
            'is_status_change', 'is_complete_duplicate', 'listed_date_present'
            and 'is_duplicate'. It is not modified.
        all_ids_with_listed_date: iterable of the property ids that have at
            least one record with a listed date.

    Rows are visited in order and a row is kept when the first matching rule says so:
        1. It has a price or status change (its property id is remembered as "changed").
        2. It is not a complete duplicate.
        3. No record of the same property has been kept yet by rules 2-4, and either
           the row has a listed date, or no record of that property has one at all.
    Rows kept by rule 1 do not count as "kept" for rule 3.

    Afterwards 'is_duplicate' is reset to False for every property that had no
    price or status change, and the helper columns 'listed_date_present',
    'is_complete_duplicate' and 'is_to_keep' are dropped.

    Returns:
        A new dataframe holding only the kept rows.
    """
    clean_df = df.copy()

    # Sets give constant-time membership tests inside the row loop
    ids_with_listed_date = set(all_ids_with_listed_date)
    kept_ids = set()
    changed_ids = set()
    keep_flags = []

    row_fields = zip(
        clean_df['property_id'],
        clean_df['is_price_change'],
        clean_df['is_status_change'],
        clean_df['is_complete_duplicate'],
        clean_df['listed_date_present'],
    )
    for property_id, price_changed, status_changed, complete_duplicate, has_listed_date in row_fields:
        # Explicit comparisons with True / False: the flags may be stored as
        # floats or NaN, and NaN must count as neither True nor False.
        if price_changed == True or status_changed == True:
            keep = True
            changed_ids.add(property_id)
        elif complete_duplicate == False:
            keep = True
            kept_ids.add(property_id)
        elif property_id in kept_ids:
            keep = False
        elif has_listed_date == True or property_id not in ids_with_listed_date:
            keep = True
            kept_ids.add(property_id)
        else:
            keep = False
        keep_flags.append(keep)

    keep_mask = np.array(keep_flags, dtype=bool)
    clean_df['is_to_keep'] = keep_mask

    # Only properties with a price or status change keep their duplicate flag
    unchanged_mask = np.array(
        [property_id not in changed_ids for property_id in clean_df['property_id']],
        dtype=bool,
    )
    clean_df.loc[unchanged_mask, 'is_duplicate'] = False

    # drop() returns a new frame, so the caller can modify the result freely
    return clean_df[keep_mask].drop(
        columns=['listed_date_present', 'is_complete_duplicate', 'is_to_keep']
    )

def _count_before(pattern, text):
    """Return the integer captured by group 1 of `pattern` in `text`, or NaN when nothing matches."""
    found = re.search(pattern, text)
    return int(found.group(1)) if found else np.nan


def get_rooms(value, room_type):
    """Retrieve the number of rooms of the given type from a scraped description.

    Parameters:
        value: text such as '6 rooms (4 bedrooms)' or
            '1 bathroom and 1 separate toilet'; matched case-insensitively.
        room_type: one of 'room', 'bedroom', 'toilet' or 'bathroom'.

    Returns:
        The count written in front of the room type, or NaN when `value` is not
        text or no count is found. For 'bedroom': when bedrooms are not
        mentioned, 0 is returned if the text states exactly 1 room, otherwise NaN.
        None is returned for an unsupported `room_type`.
    """
    if not isinstance(value, str):
        return np.nan
    text = value.lower()

    if room_type == 'room':
        return _count_before(r'(\d+) room', text)

    if room_type == 'bedroom':
        if 'bedroom' in text:
            return _count_before(r'(\d+) bedroom', text)
        # No bedrooms mentioned: a single room means there is no separate bedroom
        return 0 if _count_before(r'(\d+) room', text) == 1 else np.nan

    if room_type == 'toilet':
        # Allows one qualifier word between the count and 'toilet' ('1 separate toilet')
        return _count_before(r'(\d+)\s[a-z]*\s?toilet', text)

    if room_type == 'bathroom':
        return _count_before(r'(\d+) bathroom', text)

    return None
        
    
def get_bath_flag(value):
    """Tell whether the 'Bathroom facilities' text mentions a bath.

    Parameters:
        value: the raw 'Bathroom facilities' value.

    Returns:
        True when the lower-cased text contains 'bath', False when it does not,
        and NaN when `value` is not text (e.g. a missing value).
    """
    if not isinstance(value, str):
        return np.nan
    return 'bath' in value.lower()
    
    
def get_facilities(value, facility_type):
    """Count how many facilities of one type the 'Bathroom facilities' text lists.

    Parameters:
        value: the raw 'Bathroom facilities' text, e.g. 'Bath, 2 showers and toilet'.
        facility_type: the facility to look for (toilet, shower, bath, jacuzzi,
            steam cabin, etc.); matched case-insensitively.

    Returns:
        The number written in front of the facility when there is one, 1 when the
        facility is mentioned without a number, 0 when it is not mentioned, and
        NaN when either argument is not text.
    """
    if not isinstance(value, str) or not isinstance(facility_type, str):
        return np.nan
    text = value.lower()
    facility = facility_type.lower()

    # re.escape keeps characters such as '(' or '+' in the facility name literal
    found = re.search(r'(\d*) ' + re.escape(facility), text)
    if found and found.group(1):
        return int(found.group(1))
    return 1 if facility in text else 0

In [44]:
# Load the lists of property advertisement dictionaries from pickle files
def load_pickled_ads(path):
    """Return the object stored in the pickle file at `path`.

    NOTE(review): unpickling can execute arbitrary code contained in the file,
    so only load archives that were written by this project's own scraper.
    """
    with open(path, 'rb') as ads_list_pickle:
        return pickle.load(ads_list_pickle)


dec_19_ads_list = load_pickled_ads('./Cellar/Archive/ads_so_far_20191215.pkl')
print(f'Number of records in 2019 December 15th ads list is: {len(dec_19_ads_list)}.')

# NOTE(review): this file name is not zero-padded like the December one
# (202029 rather than 20200209) - confirm it matches the archive on disk.
feb_20_ads_list = load_pickled_ads('./Cellar/Archive/ads_so_far_202029.pkl')
print(f'Number of records in 2020 February 9th ads list is: {len(feb_20_ads_list)}.')

ads_list = load_pickled_ads('./Cellar/latest_ads_dataset.pkl')
print(f'Number of records in the latest ads list is: {len(ads_list)}.')

Number of records in 2019 December 15th ads list is: 3860.
Number of records in 2020 February 9th ads list is: 5398.
Number of records in the latest ads list is: 8313.


### Dictionaries to Pandas without duplicates:

1. Create a small dictionary with just property_id, title, "Listed since" and "Asking price" for both 2019 Dec and 2020 Feb lists  
    a. This makes it possible to recover the scraped dates and so increase the number of records with a usable **listed_date** (*this is a one-off process and will not have to be repeated going forward*).
2. Get all the columns from dictionaries
3. Add "listed_date", "listed_date_present", "is_duplicate", "is_complete_duplicate", "rank_same_record" and "is_latest_record" columns to the columns list
4. Create a pandas dataframe using the column list retrieved in step 2 & 3 as dataframe columns
5. Iterate over the ads_list and add a listed_date field (using get_listed_date function), where possible  
    a. In this step, also add another key-value pair to identify records with a successful conversion of "Listed since" value to date format  
    b. Since we are iterating already, just add "is_duplicate", "is_complete_duplicate", "is_status_change" and "is_price_change" columns (set them all to **False**)
6. Now, iterate over the ads_list and append records to the empty dataframe (from step 4)  
    a. Assign a rank to each row (rank of the same property id and title in the dataframe). Most records will simply have **1** in this column   
    b. Create a *check_duplicates*, *check_complete_duplicate*, *check_price_change* and *check_status_change*  lists in order to carry out below steps    
    c. Check if a record with the same property id and title has been seen, if so update "is_duplicate" to **True**  
    d. Whenever a duplicate flag is set to **True**, find the earlier records of the same property in the dataframe and set "is_duplicate" to **True** for them as well (this makes it quick to identify every record that has duplicates in the dataframe)  
    e. Check if a record with the same id, title, asking price and status, has already been seen, if so set "is_complete_duplicate" to **True**  
    f. Check if a record with the same id, title and asking price has been seen, if so update "is_status_change" to **True**  
    g. Check if a record with the same id, title and status has been seen, if so update "is_price_change" to **True**     
7. As a final step, filter the dataframe to only keep **unique** records **with** a listed date (where possible, where not - keep the first record)  
8. Now, we are ready to convert columns to the right formats, clean up text values and make them numbers, etc.  



* Additional challenge: if there are duplicate records and one of them has no scraped_date and has "Listed since" = Today, while the other(s) has an actual listed_date, we can update scraped_date field of the record with no scraped date

In [45]:
# Step 1 - Reduce every ad of the 2019 Dec and 2020 Feb lists to just property_id, title,
# "Listed since" and "Asking price", so that ads in the latest list can be matched against them
scrape_date_check = ['property_id', 'title', 'Listed since', 'Asking price']


def build_check_list(ads, keys):
    """Return, for every ad, the list of its values for `keys` (None where a key is absent).

    Values are looked up by key name, so the result does not depend on the
    order in which the keys are stored inside each ad dictionary.
    """
    return [[ad.get(key) for key in keys] for ad in ads]


dec19_check_list = build_check_list(dec_19_ads_list, scrape_date_check)
feb20_check_list = build_check_list(feb_20_ads_list, scrape_date_check)

# Sets of tuples allow a constant-time lookup per ad instead of scanning the whole list
dec19_check_keys = {tuple(values) for values in dec19_check_list}
feb20_check_keys = {tuple(values) for values in feb20_check_list}


# i. Assign a scraped date where possible (the December archive takes precedence);
#    ads found in neither archive are left untouched
for ad in ads_list:
    ad_check_key = tuple(ad.get(key) for key in scrape_date_check)
    if ad_check_key in dec19_check_keys:
        ad['scraped_date'] = dt.date(2019, 12, 15)
    elif ad_check_key in feb20_check_keys:
        ad['scraped_date'] = dt.date(2020, 2, 9)

In [46]:
# Step 2 - Collect every key that occurs in the ads, in first-seen order, so that the
# dataframe columns are sorted as in the dictionaries
column_order = dict.fromkeys(feat_name for ad in ads_list for feat_name in ad)


# Step 3 - Add the derived columns. Names that are already present (e.g. when this cell
# is run again on ads that were enriched earlier) are not added a second time.
column_order.update(dict.fromkeys([
    'listed_date',
    'listed_date_present',
    'is_duplicate',
    'is_complete_duplicate',
    'rank_same_record',
    'is_latest_record',
]))
column_list = list(column_order)


# Step 4 - Initiate the dataframe with the desired columns
ads_df = pd.DataFrame(columns=column_list)


# Step 5 - Enrich every ad in ads_list:
#   i.  'listed_date' holds the "Listed since" text converted to a date by string_to_date
#       (NaN when it is not an absolute date) and 'listed_date_present' tells whether
#       that conversion succeeded
#   ii. all bookkeeping flags start out as False
bookkeeping_flags = ('is_duplicate', 'is_complete_duplicate', 'is_price_change',
                     'is_status_change', 'is_latest_record')

for ad in ads_list:
    parsed_date, was_parsed = string_to_date(ad['Listed since'])
    ad['listed_date'] = parsed_date
    ad['listed_date_present'] = was_parsed
    for flag_name in bookkeeping_flags:
        ad[flag_name] = False
    
    
# Step 6 - Walk through ads_list in order, flag duplicates / price changes / status changes
# by comparing each ad with the ads seen before it, and build ads_df from the records.
#   - is_duplicate: an earlier ad has the same property_id and title
#     (the earlier records of that listing are flagged as well)
#   - is_complete_duplicate: an earlier ad has the same property_id, title, asking price and status
#     (all earlier records with the same property_id and title are flagged as well)
#   - is_price_change: not a complete duplicate, but property_id, title and status were seen before
#   - is_status_change: not a complete duplicate, but property_id, title and asking price were seen before
# The records are collected in a list and turned into a dataframe once at the end;
# DataFrame.append is no longer available in pandas 2.x and was quadratic in a loop.
check_duplicates = ['property_id', 'title']
check_complete_duplicate = ['property_id', 'title', 'Asking price', 'Status']
check_price_change = ['property_id', 'title', 'Status']
check_status_change = ['property_id', 'title', 'Asking price']


def _match_key(ad, fields):
    """Return the ad's values for `fields` as a tuple, or None when any of them is missing.

    A key with a missing value (absent, None or NaN) never counts as seen before.
    """
    values = tuple(ad.get(field) for field in fields)
    for item in values:
        if item is None or (isinstance(item, float) and item != item):
            return None
    return values


ad_records = []              # one dictionary per dataframe row, in ads_list order
records_by_listing = {}      # (property_id, title) -> records collected so far for that listing
rank_by_listing = {}         # (property_id, title) -> number of ads seen so far
seen_complete_keys = set()
seen_price_change_keys = set()
seen_status_change_keys = set()

for ad in ads_list:
    # Assign a rank_same_record value (1 for the first ad of a listing, 2 for the second, ...)
    listing = tuple(ad.get(key) for key in check_duplicates)
    rank_by_listing[listing] = rank_by_listing.get(listing, 0) + 1
    ad['rank_same_record'] = rank_by_listing[listing]

    # dpl = duplicate, cdpl = complete duplicate, pc = price change, sc = status change
    ad_check_dpl_value = _match_key(ad, check_duplicates)
    ad_check_cdpl_value = _match_key(ad, check_complete_duplicate)
    ad_check_pc_value = _match_key(ad, check_price_change)
    ad_check_sc_value = _match_key(ad, check_status_change)

    earlier_records = records_by_listing.get(ad_check_dpl_value, [])

    if earlier_records:
        ad['is_duplicate'] = True
        for record in earlier_records:
            record['is_duplicate'] = True
    if ad_check_cdpl_value in seen_complete_keys:
        ad['is_complete_duplicate'] = True
        for record in earlier_records:
            record['is_complete_duplicate'] = True
    if not ad['is_complete_duplicate']:
        if ad_check_pc_value in seen_price_change_keys:
            ad['is_price_change'] = True
        if ad_check_sc_value in seen_status_change_keys:
            ad['is_status_change'] = True

    # Store a snapshot of the ad: flags set later on earlier records are written to the
    # snapshot only and do not alter the dictionaries in ads_list.
    record = dict(ad)
    ad_records.append(record)
    if ad_check_dpl_value is not None:
        records_by_listing.setdefault(ad_check_dpl_value, []).append(record)
    if ad_check_cdpl_value is not None:
        seen_complete_keys.add(ad_check_cdpl_value)
    if ad_check_pc_value is not None:
        seen_price_change_keys.add(ad_check_pc_value)
    if ad_check_sc_value is not None:
        seen_status_change_keys.add(ad_check_sc_value)

# Columns: column_list first, followed by any record key not listed there (in first-seen order)
ads_columns = list(dict.fromkeys(column_list))
known_columns = set(ads_columns)
for record in ad_records:
    for key in record:
        if key not in known_columns:
            known_columns.add(key)
            ads_columns.append(key)

ads_df = pd.DataFrame(ad_records, columns=ads_columns)

print(f'Original ads dataframe had {len(ads_df)} non-unique records.')
        
    
# Step x - Assign a value (True / False) to each row's is_latest_record column 
# (to later be able to only filter on unique records and pick the latest)
# latest_ads_helper_df = ads_df.groupby('property_id')['rank_same_record'].max().reset_index()

# for index, row in ads_df.iterrows():
#     if row[['property_id', 'rank_same_record']].values.tolist() in latest_ads_helper_df.values.tolist():
#         ads_df.at[index, 'is_latest_record'] = True
    
    
# Step 7 - Keep only unique records, preferring those with a listed date (where possible)
# First collect the ids of all properties that have at least one record with a listed date
ads_with_listed_date = ads_df[ads_df['listed_date_present'] == True]
all_ids_with_listed_date = ads_with_listed_date.groupby('property_id').size().index.tolist()
print(f'All ads with listed date list had the total of {len(all_ids_with_listed_date)} unique property ids.')

clean_ads_df = get_clean_df(ads_df, all_ids_with_listed_date)
removed_count = len(ads_df) - len(clean_ads_df)
print(f'Clean ads dataframe has {len(clean_ads_df)} unique records. Total number of records removed is {removed_count}.')

Original ads dataframe had 8313 non-unique records.
All ads with listed date list had the total of 1501 unique property ids.
Clean ads dataframe has 6845 unique records. Total number of records removed is 1468.


In [47]:
# Spot check: all cleaned records of property 87099199
clean_ads_df.loc[clean_ads_df['property_id'] == 87099199]
# clean_ads_df = get_clean_df(ads_df, all_ids_with_listed_date)
# print(f'Clean ads dataframe has {len(clean_ads_df)} unique records. Total number of records removed is {len(ads_df) - len(clean_ads_df)}.')

Unnamed: 0,property_link,property_id,title,address,price,neighbourhood,Transfer of ownership,Asking price,Asking price per m²,Listed since,Status,Acceptance,VVE (Owners Association) contribution,Construction,Type apartment,Building type,Year of construction,Surface areas and volume,Areas,Living area,Exterior space attached to the building,Volume in cubic meters,Layout,Number of rooms,Number of bath rooms,Bathroom facilities,Number of residential layers (stories),Energy,Energy label,Insulation,Heating,Hot water,CH boiler,Cadastral data,Exterior space,Location,Balcony/roof garden,VVE (Owners Association) checklist,Registration with KvK,Annual meeting,Periodic contribution,Reserve fund present,Maintenance plan,Building insurance,scraped_date,Specific,Type of roof,Other space inside the building,Located at,Facilities,...,Parking,Type of parking facilities,Quality marks,External storage space,Kind of house,Plot size,Garden,Back garden,Garden location,Facilities_Storage space,Garage,Type of garage,Capacity,Facilities_Garage,Construction period,Insulation_Garage,Accessibility,Original asking price,Service charges,Sun terrace,Provisional energy label,Front garden,Patio/atrium,Insulation_Storage space,Rental price,Deposit,Rental fees,Rental agreement,Ownership situation,Purchase combination,Side garden,Location_Exterior space,Commercial property,Office space,Consulting rooms,Auction,Price,Auction date,Auction period,Area,First rental price,Number of stories,Type of auction,Auction party,listed_date,is_duplicate,rank_same_record,is_latest_record,is_price_change,is_status_change
4739,https://www.funda.nl/en/koop/amsterdam/apparte...,87099199,Geuzenkade 83 1,1056 KP Amsterdam,"€ 309,500 k.k.","Geuzenbuurt, Amsterdam",,"€ 309,500 k.k.","€ 6,316","January 31, 2020",Available,Available in consultation,€ 97 per month,,Upstairs apartment (apartment),Resale property,1934,,,49 m²,6 m²,167 m³,,3 rooms (2 bedrooms),1 separate toilet,,1 residential layer (story),,,,,,,,,,Balcony present,,Yes,Yes,Yes (€ 97 per month),Yes,Yes,Yes,2020-02-09,,,,,,...,,Paid parking,,4 m²,,,,,,,,,,,,,,,,,G What does this mean?,,,,,,,,,,,,,,,,,,,,,,,,2020-01-31,False,1,False,0.0,0.0
7875,https://www.funda.nl/en/koop/amsterdam/apparte...,87099199,Geuzenkade 83 1,1056 KP Amsterdam,"€ 289,500 k.k.","Geuzenbuurt, Amsterdam",,"€ 289,500 k.k.","€ 5,908",8 weeks,Sold under reservation,Available in consultation,€ 97 per month,,Upstairs apartment (apartment),Resale property,1934,,,49 m²,6 m²,167 m³,,3 rooms (2 bedrooms),1 separate toilet,,,,,,,,,,,,Balcony present,,Yes,Yes,Yes (€ 97 per month),Yes,Yes,Yes,2020-03-28,,,,,,...,,Paid parking,,4 m²,,,,,,,,,,,,,,"€ 309,500 k.k.",,,G What does this mean?,,,,,,,,,,,,,,,,,,,,,1 story,,,,False,2,False,0.0,0.0


In [48]:
# Spot check: all cleaned records of property 40051099 (another candidate: 41420958)
clean_ads_df.loc[clean_ads_df['property_id'] == 40051099]
# ads_df[ads_df['property_id'] == 40051099]
# ad_check_dpl_value = [40051099, 'Prinsengracht 759 hs']
# ads_df[(ads_df['property_id'] == ad_check_dpl_value[0]) & (ads_df['title'] == ad_check_dpl_value[1])]['is_duplicate']

Unnamed: 0,property_link,property_id,title,address,price,neighbourhood,Transfer of ownership,Asking price,Asking price per m²,Listed since,Status,Acceptance,VVE (Owners Association) contribution,Construction,Type apartment,Building type,Year of construction,Surface areas and volume,Areas,Living area,Exterior space attached to the building,Volume in cubic meters,Layout,Number of rooms,Number of bath rooms,Bathroom facilities,Number of residential layers (stories),Energy,Energy label,Insulation,Heating,Hot water,CH boiler,Cadastral data,Exterior space,Location,Balcony/roof garden,VVE (Owners Association) checklist,Registration with KvK,Annual meeting,Periodic contribution,Reserve fund present,Maintenance plan,Building insurance,scraped_date,Specific,Type of roof,Other space inside the building,Located at,Facilities,...,Parking,Type of parking facilities,Quality marks,External storage space,Kind of house,Plot size,Garden,Back garden,Garden location,Facilities_Storage space,Garage,Type of garage,Capacity,Facilities_Garage,Construction period,Insulation_Garage,Accessibility,Original asking price,Service charges,Sun terrace,Provisional energy label,Front garden,Patio/atrium,Insulation_Storage space,Rental price,Deposit,Rental fees,Rental agreement,Ownership situation,Purchase combination,Side garden,Location_Exterior space,Commercial property,Office space,Consulting rooms,Auction,Price,Auction date,Auction period,Area,First rental price,Number of stories,Type of auction,Auction party,listed_date,is_duplicate,rank_same_record,is_latest_record,is_price_change,is_status_change
1845,https://www.funda.nl/en/koop/amsterdam/apparte...,40051099,Prinsengracht 759 hs,1017 JZ Amsterdam,"€ 2,690,000 v.o.n.","Grachtengordel-Zuid, Amsterdam",,"€ 2,690,000 v.o.n.","€ 10,760",6+ months,Available,Available in consultation,€ 468 per month,,Ground-floor + upstairs apartment (apartment),New property,2019,,,250 m²,51 m²,700 m³,,5 rooms (3 bedrooms),1 separate toilet,,2 residential layers (stories),,Not available,Completely insulated,Heat pump,CH boiler,,,,In center,,,No,No,No,No,No,No,2019-12-15,,,,Ground floor,"Mechanical ventilation, TV via cable and slidi...",...,,Parking garage,,,,,Back garden,,,,,Parking place,,,,,,"€ 2,950,000 v.o.n.",,,,,,,,,,,,,,,,,,,,,,,,,,,,False,1,False,0.0,0.0


In [49]:
# # This little step is to find all the records that were not duplicates when price was included, but became such, 
# # when price was removed from the list of columns. The same can be done for Status (by removing Status from the "3field_df" list of columns)

# duplicates_df = ads_df[ads_df.duplicated(subset=['property_id', 'address', 'Asking price', 'Status'], keep=False) == True]
# price_chng_df = ads_df[ads_df.duplicated(subset=['property_id', 'address', 'Status'], keep=False) == True]
# status_chng_df = ads_df[ads_df.duplicated(subset=['property_id', 'address', 'Asking price'], keep=False) == True]

# just_price_change_df = pd.DataFrame()
# counter = 0

# for i, row in price_chng_df.iterrows():
#     if row[['property_id', 'address', 'Status']].values.tolist() not in duplicates_df[['property_id', 'address', 'Status']].values.tolist():
#         counter += 1
#         just_price_change_df = just_price_change_df.append(row)

# print(counter)
# just_price_change_df

In [50]:
# Spot check of one property; other ids worth inspecting: 40051099, 41420958, 87804888, 87804860, 87998136, 87998780
clean_ads_df.loc[clean_ads_df['property_id'] == 87099199]

Unnamed: 0,property_link,property_id,title,address,price,neighbourhood,Transfer of ownership,Asking price,Asking price per m²,Listed since,Status,Acceptance,VVE (Owners Association) contribution,Construction,Type apartment,Building type,Year of construction,Surface areas and volume,Areas,Living area,Exterior space attached to the building,Volume in cubic meters,Layout,Number of rooms,Number of bath rooms,Bathroom facilities,Number of residential layers (stories),Energy,Energy label,Insulation,Heating,Hot water,CH boiler,Cadastral data,Exterior space,Location,Balcony/roof garden,VVE (Owners Association) checklist,Registration with KvK,Annual meeting,Periodic contribution,Reserve fund present,Maintenance plan,Building insurance,scraped_date,Specific,Type of roof,Other space inside the building,Located at,Facilities,...,Parking,Type of parking facilities,Quality marks,External storage space,Kind of house,Plot size,Garden,Back garden,Garden location,Facilities_Storage space,Garage,Type of garage,Capacity,Facilities_Garage,Construction period,Insulation_Garage,Accessibility,Original asking price,Service charges,Sun terrace,Provisional energy label,Front garden,Patio/atrium,Insulation_Storage space,Rental price,Deposit,Rental fees,Rental agreement,Ownership situation,Purchase combination,Side garden,Location_Exterior space,Commercial property,Office space,Consulting rooms,Auction,Price,Auction date,Auction period,Area,First rental price,Number of stories,Type of auction,Auction party,listed_date,is_duplicate,rank_same_record,is_latest_record,is_price_change,is_status_change
4739,https://www.funda.nl/en/koop/amsterdam/apparte...,87099199,Geuzenkade 83 1,1056 KP Amsterdam,"€ 309,500 k.k.","Geuzenbuurt, Amsterdam",,"€ 309,500 k.k.","€ 6,316","January 31, 2020",Available,Available in consultation,€ 97 per month,,Upstairs apartment (apartment),Resale property,1934,,,49 m²,6 m²,167 m³,,3 rooms (2 bedrooms),1 separate toilet,,1 residential layer (story),,,,,,,,,,Balcony present,,Yes,Yes,Yes (€ 97 per month),Yes,Yes,Yes,2020-02-09,,,,,,...,,Paid parking,,4 m²,,,,,,,,,,,,,,,,,G What does this mean?,,,,,,,,,,,,,,,,,,,,,,,,2020-01-31,False,1,False,0.0,0.0
7875,https://www.funda.nl/en/koop/amsterdam/apparte...,87099199,Geuzenkade 83 1,1056 KP Amsterdam,"€ 289,500 k.k.","Geuzenbuurt, Amsterdam",,"€ 289,500 k.k.","€ 5,908",8 weeks,Sold under reservation,Available in consultation,€ 97 per month,,Upstairs apartment (apartment),Resale property,1934,,,49 m²,6 m²,167 m³,,3 rooms (2 bedrooms),1 separate toilet,,,,,,,,,,,,Balcony present,,Yes,Yes,Yes (€ 97 per month),Yes,Yes,Yes,2020-03-28,,,,,,...,,Paid parking,,4 m²,,,,,,,,,,,,,,"€ 309,500 k.k.",,,G What does this mean?,,,,,,,,,,,,,,,,,,,,,1 story,,,,False,2,False,0.0,0.0


In [51]:
# Duplicate values: 87804888, 87804860, 87998136

In [52]:
# test_df = ads_df[ads_df['property_id'] == 40051099]

# test_df['listed_date'] = test_df.apply(lambda x: get_listed_date(x['Listed since'], x['scraped_date']), axis=1)
# test_df

In [53]:
# Step 8 - Convert columns to the right formats: clean up text values, extract counts, etc.

clean_ads_df = clean_ads_df.drop(columns=['price'])
clean_ads_df['property_id'] = clean_ads_df['property_id'].map(int)
clean_ads_df['listed_date'] = clean_ads_df.apply(
    lambda row: get_listed_date(row['Listed since'], row['scraped_date']), axis=1)
clean_ads_df['address'] = clean_ads_df['title'] + ', ' + clean_ads_df['address']

# Fields holding a number wrapped in currency / unit text
for text_number_column in ['Asking price',
                           'Asking price per m²',
                           'VVE (Owners Association) contribution',
                           'Year of construction',
                           'Living area',
                           'Exterior space attached to the building',
                           'Volume in cubic meters']:
    clean_ads_df[text_number_column] = clean_ads_df[text_number_column].apply(get_int)

# New count columns: (new column, source column, room type)
for new_column, source_column, room_type in [('Rooms', 'Number of rooms', 'room'),
                                             ('Bedrooms', 'Number of rooms', 'bedroom'),
                                             ('Bathrooms', 'Number of bath rooms', 'bathroom'),
                                             ('Toilets', 'Number of bath rooms', 'toilet')]:
    clean_ads_df[new_column] = clean_ads_df[source_column].apply(get_rooms, room_type=room_type)

clean_ads_df['Has_Bathtub'] = clean_ads_df['Bathroom facilities'].apply(get_bath_flag)

# New count columns derived from the bathroom facilities text: (new column, facility type)
for new_column, facility_type in [('Baths', 'bath'),
                                  ('Number of Toilets', 'toilet'),
                                  ('Showers', 'shower')]:
    clean_ads_df[new_column] = clean_ads_df['Bathroom facilities'].apply(get_facilities, facility_type=facility_type)

for label_column in ['Energy label', 'Provisional energy label']:
    clean_ads_df[label_column] = clean_ads_df[label_column].apply(get_energy_label)


clean_ads_df = clean_ads_df.rename(columns={
    'Asking price': 'Asking price (€)',
    'Asking price per m²': 'Asking price per m² (€)',
    'VVE (Owners Association) contribution': 'VVE contribution (monthly) (€)',
    'Living area': 'Living area (m²)',
    'Volume in cubic meters': 'Volume (m³)',
})

In [54]:
# Preview the first five cleaned records (five is the default for head)
clean_ads_df.head()

Unnamed: 0,property_link,property_id,title,address,neighbourhood,Transfer of ownership,Asking price (€),Asking price per m² (€),Listed since,Status,Acceptance,VVE contribution (monthly) (€),Construction,Type apartment,Building type,Year of construction,Surface areas and volume,Areas,Living area (m²),Exterior space attached to the building,Volume (m³),Layout,Number of rooms,Number of bath rooms,Bathroom facilities,Number of residential layers (stories),Energy,Energy label,Insulation,Heating,Hot water,CH boiler,Cadastral data,Exterior space,Location,Balcony/roof garden,VVE (Owners Association) checklist,Registration with KvK,Annual meeting,Periodic contribution,Reserve fund present,Maintenance plan,Building insurance,scraped_date,Specific,Type of roof,Other space inside the building,Located at,Facilities,Storage space,...,Garden location,Facilities_Storage space,Garage,Type of garage,Capacity,Facilities_Garage,Construction period,Insulation_Garage,Accessibility,Original asking price,Service charges,Sun terrace,Provisional energy label,Front garden,Patio/atrium,Insulation_Storage space,Rental price,Deposit,Rental fees,Rental agreement,Ownership situation,Purchase combination,Side garden,Location_Exterior space,Commercial property,Office space,Consulting rooms,Auction,Price,Auction date,Auction period,Area,First rental price,Number of stories,Type of auction,Auction party,listed_date,is_duplicate,rank_same_record,is_latest_record,is_price_change,is_status_change,Rooms,Bedrooms,Bathrooms,Toilets,Has_Bathtub,Baths,Number of Toilets,Showers
0,https://www.funda.nl/en/koop/amsterdam/apparte...,41580542,Jacob van Lennepkade 2 1/2,"Jacob van Lennepkade 2 1/2, 1053 MJ Amsterdam","Van Lennepbuurt, Amsterdam",,1000000,6452,2 weeks,Available,Available in consultation,200.0,,Upstairs apartment (double upstairs apartment),Resale property,1906,,,155,12,537,,6 rooms (4 bedrooms),1 bathroom and 1 separate toilet,"Bath, shower and toilet",2 residential layers (stories),,E,Partly double glazed,CH boiler,CH boiler,Combination boiler,AMSTERDAM Q 8745; Ownership situation; Full ow...,,"Alongside water, in residential district and u...",Balcony present,,Yes,Yes,Yes (€ 200 per month),Yes,Yes,Yes,2019-12-15,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-12-01,False,1,False,0.0,0.0,6.0,4.0,1.0,1.0,True,1.0,1.0,1.0
2,https://www.funda.nl/en/koop/amsterdam/apparte...,87974301,Swammerdamstraat 6 2,"Swammerdamstraat 6 2, 1091 RT Amsterdam","Weesperzijde, Amsterdam",,675000,6888,2 weeks,Under offer,Available in consultation,200.0,,Mezzanine (apartment),Resale property,1882,,,98,11,353,,4 rooms (3 bedrooms),1 bathroom and 1 separate toilet,Toilet,1 residential layer (story),,D,Floor insulation and partly double glazed,CH boiler,CH boiler,"HR (gas-fired combination boiler from 2013, in...",AMSTERDAM S 8359; Ownership situation; Full ow...,,Alongside a quiet road and in residential dist...,Balcony present,,Yes,Yes,Yes (€ 200 per month),Yes,Yes,Yes,2019-12-15,,Flat roof covered with asphalt roofing,,3rd level of residential structure,"Mechanical ventilation, TV via cable and flue",,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-12-01,True,1,False,0.0,0.0,4.0,3.0,1.0,1.0,False,0.0,1.0,0.0
3,https://www.funda.nl/en/koop/amsterdam/huis-41...,41532480,Peter Martensstraat 70,"Peter Martensstraat 70, 1087 NA Amsterdam","IJburg Zuid, Amsterdam",,585000,4432,"December 6, 2019",Available,Available in consultation,,,,Resale property,2011,,,132,11,474,,5 rooms (4 bedrooms),1 bathroom and 1 separate toilet,Toilet,3 residential layers (stories),,A,Double glazing and completely insulated,District heating,Central facility,,AMSTERDAM AU 2867; Area; 120 m²; Ownership sit...,,Alongside a quiet road and in residential dist...,Roof terrace present and balcony present,,,,,,,,2019-12-15,,Flat roof,,,"Glass fiber cable, mechanical ventilation, sli...",,...,Located at the southeast,Electricity,,Parking place,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-12-06,True,1,False,0.0,0.0,5.0,4.0,1.0,1.0,False,0.0,1.0,0.0
4,https://www.funda.nl/en/koop/amsterdam/huis-41...,41531188,Monte Viso 5,"Monte Viso 5, 1060 PE Amsterdam","Middelveldsche Akerpolder, Amsterdam",,475000,3958,"December 6, 2019",Available,Available in consultation,,,,Resale property,1995,,,120,14,445,,5 rooms (3 bedrooms),1 bathroom and 1 separate toilet,"Bath, shower and toilet",3 residential layers (stories),,B,Completely insulated,CH boiler,CH boiler,"Intergas HR (gas-fired from 2012, in ownership)",SLOTEN NOORD-HOLLAND G 2405; Area; 152 m²; Own...,,"Alongside water, alongside a quiet road, in re...",Roof terrace present,,,,,,,,2019-12-15,,Flat roof covered with asphalt roofing,14 m²,,Mechanical ventilation and TV via cable,,...,Located at the southwest,,,"Built-in, parking place and garage with carport",1 car,Heating and electricity,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-12-06,True,1,False,0.0,0.0,5.0,3.0,1.0,1.0,True,1.0,1.0,1.0
5,https://www.funda.nl/en/koop/amsterdam/huis-41...,41537580,Spankerenkade 5,"Spankerenkade 5, 1107 ZX Amsterdam","Holendrecht/Reigersbos, Amsterdam",,299500,3403,"December 3, 2019",Available,Available in consultation,,,,Resale property,1985,,,88,1,312,,3 rooms (2 bedrooms),1 bathroom and 1 separate toilet,Shower and toilet,2 residential layers (stories),,A,"Roof insulation, insulated walls and floor ins...",CH boiler,CH boiler,Intergas (gas-fired combination boiler from 20...,WEESPERKARSPEL L 3296; Area; 130 m²; Ownership...,,"Alongside a quiet road, in residential distric...",,,,,,,,,2019-12-15,,Flat roof,1 m²,,,,...,Located at the west accessible via the rear,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-12-03,True,1,False,0.0,0.0,3.0,2.0,1.0,1.0,False,0.0,1.0,1.0


In [55]:
# Open (or create) the SQLite database that stores the cleaned listings.
# sqlite3.connect() creates a missing database *file* but not a missing folder,
# so make sure the target folder exists first; otherwise a fresh checkout fails
# with "unable to open database file".
import os

DB_PATH = './Database/ams_market_watch.db'  # change the file name to start a new database
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)

# The path is relative, so the file lives under ./Database of the current working directory.
conn = sqlite3.connect(DB_PATH)
# cursor = conn.cursor()  # not needed while pandas is handed the connection directly

In [56]:
# Initial load: replace any existing 'funda_ads' table so that all records are written.
# Once the database is set up, this is to be swapped for an append-new-records statement.
clean_ads_df.to_sql(name='funda_ads', con=conn, if_exists='replace', index=False)

In [60]:
# Read the whole table back from the database to check what was stored.
current_db_df = pd.read_sql_query("SELECT * FROM funda_ads", con=conn)
# Ad-hoc checks kept for reference:
# current_db_df[current_db_df[['property_id', 'title']].duplicated()]
# current_db_df[current_db_df['property_id'] == 87099199]

# Row count via len(): `count()[0]` only counts non-null values of the first column
# and relies on positional Series indexing, which pandas has deprecated.
len(current_db_df)

6845

### Let's plot some stuff! :)

In [58]:
# sns.scatterplot(current_db_df['Bedrooms'].dropna(), current_db_df['Asking price per m² (€)'].dropna())