# Master Thesis - Data Scraping


In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import json
import pandas as pd
import time
import requests
import concurrent.futures
import lxml

## Functions for individual webpages 

- Functions for scariping Boligsiden.dk

In [2]:
#### Get URL for Boligsiden search for specified period in selected Kommune

def get_url_boligsiden(kommune, startdate, enddate, p):
    url = 'http://www.boligsiden.dk/salgspris/solgt/alle/{}'
    params = '?periode.from={}&periode.to={}&displaytab=mergedtab&sort' \
             '=salgsdato&salgstype=%5Bobject%20Object%5D&kommune={}'
    full_url = url + params
    return full_url.format(p, startdate, enddate, kommune)

#### Get number of pages for Boligsiden search

def get_max_pages_boligsiden(url):
    options = webdriver.chrome.options.Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')

    driver = webdriver.Chrome('/Users/Mikkel/Documents/Drivers/chromedriver', options=options)
    driver.get(url)
    
    page_text = driver.find_element_by_class_name("salesprice-result").text

    last_page_num = (page_text.split("af ")[1]).split("\n")[0]
    return last_page_num

#### Get all address links on search page

def get_all_urls_on_page_boligsiden(url):
    options = webdriver.chrome.options.Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')

    driver = webdriver.Chrome('/Users/Mikkel/Documents/Drivers/chromedriver', options=options)
    driver.get(url)

    all_https = []
    with_reentries_https = []

    for elem in driver.find_elements_by_tag_name('a'):
        all_https.append(elem.get_attribute("href"))

    #bolig-links wanted appear multiple times, so we take away all single time occuring links
    for i in range(len(all_https)):
        if all_https[i] in all_https[:i]:
            with_reentries_https.append(all_https[i])

    #Take away first two entries, which are not bolig links
    with_reentries_https = with_reentries_https[2:]

    reduced_list = list(set(with_reentries_https))

    #To make sure no other links are included
    boliger_https = []
    condition = 'https://www.boligsiden.dk/adresse/'
    entry = 0
    error_count = 0
    for i in reduced_list:
        if isinstance(i, str):
            if condition in i:
                boliger_https.append(i)

    return boliger_https

#### Get list of all address URLs for search

def get_all_links_boligsiden(kommune, startdate, enddate):
    # Returns first https-page with given variables
    first_page = get_url_boligsiden(kommune, startdate, enddate, 1)

    # Getting number of total pages
    total_pages = get_max_pages_boligsiden(first_page)

    # Empty lists
    link_to_all_pages = []
    list_of_all_pages = []

    # Collects a list with all the pages that we want to collect
    for x in tqdm(range(int(total_pages))):
        all_pages = get_url_boligsiden(kommune, startdate, enddate, x + 1)
        link_to_all_pages.append(all_pages)

        page_list = get_all_urls_on_page_boligsiden(link_to_all_pages[x])
        list_of_all_pages.extend(page_list)

    # Returns list with all the wanted url's
    return (list_of_all_pages)

#### Scrape information for single address on address URL 

def get_simple_single_page_boligsiden(url):

    url = url
    html = urlopen(url)
    soup = BeautifulSoup(html.read(), 'html.parser')
    head = str(soup.find('head'))
    try:
        json_string = re.search(r'__bs_addresspresentation__ = ([^;]*)', head).group(1)
        data = json.loads(json_string)
        df1 = pd.json_normalize(data)
        df2 = pd.DataFrame()
    except:
        json_string = re.search(r'__bs_propertypresentation__ = ([^;]*)', head).group(1)
        data = json.loads(json_string)
        df2 = pd.json_normalize(data)
        df1 = pd.DataFrame()

    return df1, df2

#### Collect scraped information for all addresses in two dataframes

def get_data_boligsiden(links):
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()

    for x in tqdm(range(0, len(links))):
        try:
            df_pages1, df_pages2 = get_simple_single_page_boligsiden(links[x])
            df1 = pd.concat([df1, df_pages1])
            df2 = pd.concat([df2, df_pages2])
        except:
            pass


    return df1, df2

- Functions for scraping DinGeo.dk

In [3]:
#### Get DinGeo-URLs for all addresses in Boligsiden dataframes 

def get_geolinks1(df):
    df["dingeo_link"] = ""

    for x in range(0, len(df)):
        if '-' in (df['address.street'][x]):
            df['address.street'][x] = df['address.street'].str.split('-').str[0][x] + '--' \
                                      + df['address.street'].str.split('-').str[1][x]

        if ',' in (df['address.street'][x]):
            add_part = str(df['address.postalId'][x]) + '-' + df['address.city'][x].replace(" ", "-") + '/' \
                       + df['address.street'].str.split(',').str[0][x].replace(" ","-") + '/' \
                       + df['address.street'].str.split(', ').str[1][x].replace(".", "").replace(" ", "-")
            url = 'https://www.dingeo.dk/adresse/' + add_part
        elif 'Adressen er ikke tilgængelig' in (df['address.street'][x]):
            url = 'Utilgængelig'
        else:
            add_part = str(df['address.postalId'][x]) + '-' + df['address.city'][x].replace(" ", "-") + '/' \
                       + df['address.street'].str.split(',').str[0][x].replace(" ","-")
            url = 'https://www.dingeo.dk/adresse/' + add_part

        if '-lejl-' in url:
            url = url.replace('-lejl-','-')

        df['dingeo_link'][x] = url

    return df

def get_geolinks2(df):
    df["dingeo_link"] = ""

    for x in range(0, len(df)):
        if '-' in (df['property.address'][x]):
            df['property.address'][x] = df['property.address'].str.split('-').str[0][x] + '--' \
                                        + df['property.address'].str.split('-').str[1][x]

        if ',' in (df['property.address'][x]):
            ad_part = str(df['property.postal'][x]) + '-' + df['property.city'][x].replace(" ", "-") + '/' \
                      + df['property.address'].str.split(',').str[0][x].replace(" ","-") + '/' \
                      + df['property.address'].str.split(', ').str[1][x].replace(".", "").replace(" ", "-")
            url = 'https://www.dingeo.dk/adresse/' + ad_part
        elif 'Adressen er ikke tilgængelig' in (df['property.address'][x]):
            url = 'Utilgængelig'
        else:
            ad_part = str(df['property.postal'][x]) + '-' + df['property.city'][x].replace(" ", "-") + '/' \
                      + df['property.address'].str.split(',').str[0][x].replace(" ","-")
            url = 'https://www.dingeo.dk/adresse/' + ad_part

        if '-lejl-' in url:
            url = url.replace('-lejl-','-')

        df['dingeo_link'][x] = url

    return df

#### Scrape information for each individual address on DinGeo.dk

def dingeo_page(url):
    url = url

    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Dictionary
    data = {}
    data['dingeo_link'] = url
    try:
        data['Radonrisiko'] = [soup.find_all("div", {"id": 'radon'})[0].find_all("strong")[0].get_text()]
    except:
        pass

    if 'ikke registreret trafikstøj' in soup.find_all("div", {"id": 'trafikstoej'})[0].get_text():
        data['Støjmåling'] = ['Ingen trafikstøj']
    elif 'mangler desværre at indsamle trafikstøj' in soup.find_all("div", {"id": 'trafikstoej'})[0].get_text():
        data['Støjmåling'] = ['Mangler']
    else:
        data['Støjmåling'] = [soup.find_all("div", {"id": 'trafikstoej'})[0].find_all("b")[1].get_text()]

    data['Oversvømmelsesrisiko_skybrud'] = [soup.find_all("div", {"id": 'skybrud'})[0].find_all("b")[0].get_text()]
    data['Meter_over_havet'] = [soup.find_all("div", {"id": 'stormflod'})[0].find_all("b")[0].get_text()]

    table_0 = pd.read_html(str(soup.find_all('table')))[0].iloc[:, 0:2]
    table_0 = table_0.set_axis(['Tekst', 'Værdi'], axis=1, inplace=False)

    table_1 = pd.read_html(str(soup.find_all('table')))[1].iloc[:, 0:2]
    table_1 = table_1.set_axis(['Tekst', 'Værdi'], axis=1, inplace=False)

    table_2 = pd.read_html(str(soup.find_all('table')))[2].iloc[:, 0:2]
    table_2 = table_2.set_axis(['Tekst', 'Værdi'], axis=1, inplace=False)

    table_3 = pd.read_html(str(soup.find_all('table')))[3:-2]
    table_3 = pd.concat(table_3).iloc[:, 0:2]
    table_3 = table_3.set_axis(['Tekst', 'Værdi'], axis=1, inplace=False)

    table = pd.concat([table_0, table_1, table_2, table_3])

    table = table.loc[table['Tekst'].isin(['Anvendelse', 'Opførselsesår', 'Ombygningsår', 'Fredning',
                                           'Køkkenforhold', 'Antal Etager', 'Antal toiletter', 'Antal badeværelser',
                                           'Antal værelser',
                                           'Ydervægsmateriale', 'Tagmateriale', 'Varmeinstallation',
                                           'Bygning, Samlet areal', 'Boligstørrelse', 'Kælder', 'Vægtet Areal'])]
    mydict = dict(zip(table.Tekst, list(table.Værdi)))
    data.update(mydict)

    try:
        if 'ikke finde energimærke' in soup.find_all("div", {"id": 'energimaerke'})[0].get_text():
            data['Energimærke'] = ['Mangler']
        else:
            data['Energimærke'] = [soup.find_all("div", {"id": 'energimaerke'})[0].find_all("p")[0].get_text()[-3:-2]]
        data['Indbrudsrisiko'] = [soup.find_all("div", {"id": 'indbrud'})[0].find_all("u")[0].get_text()]
    except:
        pass

    try:
        if 'ikke fredet' in str(soup.find_all("div", {"id": 'fbb'})[0].find_all("h2")[0]):
            data['Bevaringsværdig'] = [0]
        elif 'Bygningen er Bevaringsværdig' in str(soup.find_all("div", {"id": 'fbb'})[0].find_all("h2")[0]):
            data['Bevaringsværdig'] = re.findall(r'\d+', str(soup.find_all("div", {"id": 'fbb'})[0].find_all("p")[4]))
        elif 'Fejl ved opslag af' in str(soup.find_all("div", {"id": 'fbb'})[0].find_all("h2")[0]):
            data['Bevaringsværdig'] = 'Mangler' #Seems to be flaw on site, all get mangler
        else:
            data['Bevaringsværdig'] = 'Ukendt'
    except:
        pass

    try:
        data['Største_parti'] = re.findall(r'valg/(.*?)(?<!\\).png',
                                           str(soup.find_all("div", {"id": 'valgdata'})[0].find_all('h2')[0]))
        data['Valgdeltagelse'] = \
        re.findall("\d+.\d+", str(soup.find_all("div", {"id": 'valgdata'})[0].find_all('p')[1]))[1]
        data['Afstemningsområde'] = [soup.find_all("div", {"id": 'valgdata'})[0].find_all("strong")[0].get_text()]
    except:
        pass

    try:
        url_vurdering = url + '/vurdering'
        resp_vurdering = requests.get(url_vurdering)
        soup_vurdering = BeautifulSoup(resp_vurdering.text, 'html.parser')
        data['AVM_pris'] = \
        soup_vurdering.find_all("div", {"id": 'avmnumber'})[0].get_text() #made correction
    except:
        pass

        # Make dataframe
    df_page = pd.DataFrame(data)

    return df_page

#### Collect all scraped data from DinGeo for the addresses and ad to Boligsiden-dataframes

def for_threading(url):

    try:
        df_pages = dingeo_page(url)
        # df_geo = pd.concat([df_geo, df_pages])
        #   time.sleep(1)
        return df_pages
    except:
        pass

def add_dingeo(df):

    url_list = df['dingeo_link'].tolist()

    df_geo = pd.DataFrame()

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = tqdm(executor.map(for_threading, url_list))

        for result in results:
            df_geo = pd.concat([df_geo, result])


    df_Boligsiden_Dingeo = pd.merge(df, df_geo, how='inner', on='dingeo_link', right_index=False).drop_duplicates()

    return df_Boligsiden_Dingeo

- Functions for scraping hvorlangterder.dk

In [4]:
#### Scrape information for single address from hvorlangterder.dk

def get_hvorlangterder(address):
    try:
        url = 'https://hvorlangterder.poi.viamap.net/v1/nearestpoi/?poitypes' \
              '=daycare,doctor,hospital,junction,metro,school,stop,strain,supermarket,train,library,pharmacy,coast' \
              ',forest,lake,airport,sportshall,publicbath,soccerfield,roadtrain&fromaddress=' + address \
              + '&mot=foot&token=eyJkcGZ4IjogImh2b3JsYW5ndGVyZGVyIiwgInByaXZzIjogInIxWjByMEYwazZCdFdxUWNPVXlrQi95N' \
                'lNVcEp2MlFiZ3lYZXRxNEhZNFhPLzNZclcwK0s5dz09In0.fP4JWis69HmaSg5jVHiK8nemiCu6VaMULSGGJyK4D4PkWq4iA1' \
                '+nSHWMaHxepKwJ83sEiy9nMNZhv7BcktRNrA'
        resp = requests.get(url)
        cont = resp.json()
        df = pd.DataFrame(cont).loc[['routedmeters']]
        df['Location'] = address

        return (df)
    except:
        pass


#### Scrape data from hvorlangterder.dk for all adresses and merge with data from Boligsiden and DinGeo.dk
    
def add_hvorlangterder(df):


    df_hvorlangt = pd.DataFrame()

    for i in tqdm(range(0,len(df))):
        try:
            data = get_hvorlangterder(str(df['Location'][i]))
            df_hvorlangt = pd.concat([df_hvorlangt, data])
        except Exception:
            pass
        time.sleep(0.2)


    merged = pd.merge(df, df_hvorlangt, how='inner', on='Location', right_index=False).drop_duplicates()
    return merged

## Scraping

We scrape data for the period from 01/01/16 until 31/12/21 and the danish municipalities København and Frederiksberg.

First we get the links needed

In [None]:
####### 2017 #######
links = get_all_links_boligsiden('København', '2017-01-01', '2017-12-31')

with open('links_boligsiden_K17.txt', 'w') as file:
    file.write(str(links))
    
links = get_all_links_boligsiden('Frederiksberg', '2017-01-01', '2017-12-31')

with open('links_boligsiden_F17.txt', 'w') as file:
    file.write(str(links))

In [None]:
####### 2018 #######
links = get_all_links_boligsiden('København', '2018-01-01', '2018-12-31')

with open('links_boligsiden_K18.txt', 'w') as file:
    file.write(str(links))
    
links = get_all_links_boligsiden('Frederiksberg', '2018-01-01', '2018-12-31')

with open('links_boligsiden_F18.txt', 'w') as file:
    file.write(str(links))

In [None]:
####### 2019 #######
links = get_all_links_boligsiden('København', '2019-01-01', '2019-12-31')

with open('links_boligsiden_K19.txt', 'w') as file:
    file.write(str(links))
    
links = get_all_links_boligsiden('Frederiksberg', '2019-01-01', '2019-12-31')

with open('links_boligsiden_F19.txt', 'w') as file:
    file.write(str(links))

In [5]:
####### 2020 ########
links = get_all_links_boligsiden('København', '2020-01-01', '2020-12-31')

with open('links_boligsiden_K20.txt', 'w') as file:
    file.write(str(links))
    
links = get_all_links_boligsiden('Frederiksberg', '2020-01-01', '2020-12-31')

with open('links_boligsiden_F20.txt', 'w') as file:
    file.write(str(links))

100%|█████████████████████████████████████████| 298/298 [23:26<00:00,  4.72s/it]
100%|███████████████████████████████████████████| 55/55 [04:07<00:00,  4.49s/it]


In [6]:
######## 2021 ########
links = get_all_links_boligsiden('København', '2021-01-01', '2021-12-31')

with open('links_boligsiden_K21.txt', 'w') as file:
    file.write(str(links))
    
links = get_all_links_boligsiden('Frederiksberg', '2021-01-01', '2021-12-31')

with open('links_boligsiden_F21.txt', 'w') as file:
    file.write(str(links))

100%|█████████████████████████████████████████| 268/268 [21:08<00:00,  4.73s/it]
100%|███████████████████████████████████████████| 47/47 [03:34<00:00,  4.56s/it]


Now we load data from 'Boligsiden.dk'

In [7]:
####### Copenhagen ##########

with open("links_boligsiden_K17.txt", "r") as file:
    links = eval(file.readline())
    
df1, df2 = get_data_boligsiden(links)
df1.to_csv('boligsiden_1_K1.csv', index=False)
df2.to_csv('boligsiden_2_K1.csv', index=False)

with open("links_boligsiden_K18.txt", "r") as file:
    links = eval(file.readline())
    
df1, df2 = get_data_boligsiden(links)
df1.to_csv('boligsiden_1_K2.csv', index=False)
df2.to_csv('boligsiden_2_K2.csv', index=False)

with open("links_boligsiden_K19.txt", "r") as file:
    links = eval(file.readline())

df1, df2 = get_data_boligsiden(links)
df1.to_csv('boligsiden_1_K3.csv', index=False)
df2.to_csv('boligsiden_2_K3.csv', index=False)

with open("links_boligsiden_K20.txt", "r") as file:
    links = eval(file.readline())

df1, df2 = get_data_boligsiden(links)
df1.to_csv('boligsiden_1_K4.csv', index=False)
df2.to_csv('boligsiden_2_K4.csv', index=False)

with open("links_boligsiden_K21.txt", "r") as file:
    links = eval(file.readline())

df1, df2 = get_data_boligsiden(links)
df1.to_csv('boligsiden_1_K5.csv', index=False)
df2.to_csv('boligsiden_2_K5.csv', index=False)

100%|█████████████████████████████████████| 9618/9618 [1:51:51<00:00,  1.43it/s]
100%|█████████████████████████████████████| 8332/8332 [1:37:40<00:00,  1.42it/s]
100%|█████████████████████████████████████| 7752/7752 [1:27:19<00:00,  1.48it/s]
100%|█████████████████████████████████████| 8934/8934 [1:37:30<00:00,  1.53it/s]
100%|█████████████████████████████████████| 8028/8028 [1:23:02<00:00,  1.61it/s]


In [8]:
############# Frederiksberg ####################

with open("links_boligsiden_F17.txt", "r") as file:
    links = eval(file.readline())
    
df1, df2 = get_data_boligsiden(links)
df1.to_csv('boligsiden_1_F1.csv', index=False)
df2.to_csv('boligsiden_2_F1.csv', index=False)

with open("links_boligsiden_F18.txt", "r") as file:
    links = eval(file.readline())
    
df1, df2 = get_data_boligsiden(links)
df1.to_csv('boligsiden_1_F2.csv', index=False)
df2.to_csv('boligsiden_2_F2.csv', index=False)

with open("links_boligsiden_F19.txt", "r") as file:
    links = eval(file.readline())

df1, df2 = get_data_boligsiden(links)
df1.to_csv('boligsiden_1_F3.csv', index=False)
df2.to_csv('boligsiden_2_F3.csv', index=False)

with open("links_boligsiden_F20.txt", "r") as file:
    links = eval(file.readline())

df1, df2 = get_data_boligsiden(links)
df1.to_csv('boligsiden_1_F4.csv', index=False)
df2.to_csv('boligsiden_2_F4.csv', index=False)

with open("links_boligsiden_F21.txt", "r") as file:
    links = eval(file.readline())

df1, df2 = get_data_boligsiden(links)
df1.to_csv('boligsiden_1_F5.csv', index=False)
df2.to_csv('boligsiden_2_F5.csv', index=False)

100%|███████████████████████████████████████| 1672/1672 [15:07<00:00,  1.84it/s]
100%|███████████████████████████████████████| 1477/1477 [13:18<00:00,  1.85it/s]
100%|███████████████████████████████████████| 1501/1501 [13:26<00:00,  1.86it/s]
100%|███████████████████████████████████████| 1621/1621 [14:17<00:00,  1.89it/s]
100%|███████████████████████████████████████| 1390/1390 [12:10<00:00,  1.90it/s]


We load data from DinGeo.dk

In [12]:
############ Copenhagen ##############

#df_Boligsiden1 = pd.read_csv("boligsiden_1_K1.csv")
#df_Boligsiden_Geo1 = get_geolinks1(df_Boligsiden1)
#df_Boligsiden_Dingeo1 = add_dingeo(df_Boligsiden_Geo1)

#df_Boligsiden_Dingeo1.to_csv('boligsiden_dingeo_1_K1.csv', index=False)


#df_Boligsiden1 = pd.read_csv("boligsiden_1_K2.csv")
#df_Boligsiden_Geo1 = get_geolinks1(df_Boligsiden1)
#df_Boligsiden_Dingeo1 = add_dingeo(df_Boligsiden_Geo1)

#df_Boligsiden_Dingeo1.to_csv('boligsiden_dingeo_1_K2.csv', index=False)

#df_Boligsiden1 = pd.read_csv("boligsiden_1_K3.csv")
#df_Boligsiden_Geo1 = get_geolinks1(df_Boligsiden1)
#df_Boligsiden_Dingeo1 = add_dingeo(df_Boligsiden_Geo1)

#df_Boligsiden_Dingeo1.to_csv('boligsiden_dingeo_1_K3.csv', index=False)

df_Boligsiden1 = pd.read_csv("boligsiden_1_K4.csv")
df_Boligsiden_Geo1 = get_geolinks1(df_Boligsiden1)
df_Boligsiden_Dingeo1 = add_dingeo(df_Boligsiden_Geo1)

df_Boligsiden_Dingeo1.to_csv('boligsiden_dingeo_1_K4.csv', index=False)

df_Boligsiden1 = pd.read_csv("boligsiden_1_K5.csv")
df_Boligsiden_Geo1 = get_geolinks1(df_Boligsiden1)
df_Boligsiden_Dingeo1 = add_dingeo(df_Boligsiden_Geo1)

df_Boligsiden_Dingeo1.to_csv('boligsiden_dingeo_1_K5.csv', index=False)

  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dingeo_link'][x] = url
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['address.street'][x] = df['address.street'].str.split('-').str[0][x] + '--' \
8801it [2:19:27,  1.05it/s]
7954it [1:48:47,  1.22it/s]


In [13]:
########## Frederiksberg #############

df_Boligsiden1 = pd.read_csv("boligsiden_1_F1.csv")
df_Boligsiden_Geo1 = get_geolinks1(df_Boligsiden1)
df_Boligsiden_Dingeo1 = add_dingeo(df_Boligsiden_Geo1)

df_Boligsiden_Dingeo1.to_csv('boligsiden_dingeo_1_F1.csv', index=False)


df_Boligsiden1 = pd.read_csv("boligsiden_1_F2.csv")
df_Boligsiden_Geo1 = get_geolinks1(df_Boligsiden1)
df_Boligsiden_Dingeo1 = add_dingeo(df_Boligsiden_Geo1)

df_Boligsiden_Dingeo1.to_csv('boligsiden_dingeo_1_F2.csv', index=False)

df_Boligsiden1 = pd.read_csv("boligsiden_1_F3.csv")
df_Boligsiden_Geo1 = get_geolinks1(df_Boligsiden1)
df_Boligsiden_Dingeo1 = add_dingeo(df_Boligsiden_Geo1)

df_Boligsiden_Dingeo1.to_csv('boligsiden_dingeo_1_F3.csv', index=False)

df_Boligsiden1 = pd.read_csv("boligsiden_1_F4.csv")
df_Boligsiden_Geo1 = get_geolinks1(df_Boligsiden1)
df_Boligsiden_Dingeo1 = add_dingeo(df_Boligsiden_Geo1)

df_Boligsiden_Dingeo1.to_csv('boligsiden_dingeo_1_F4.csv', index=False)

df_Boligsiden1 = pd.read_csv("boligsiden_1_F5.csv")
df_Boligsiden_Geo1 = get_geolinks1(df_Boligsiden1)
df_Boligsiden_Dingeo1 = add_dingeo(df_Boligsiden_Geo1)

df_Boligsiden_Dingeo1.to_csv('boligsiden_dingeo_1_F5.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dingeo_link'][x] = url
1654it [20:17,  1.36it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['address.street'][x] = df['address.street'].str.split('-').str[0][x] + '--' \
1452it [16:41,  1.45it/s]
1485it [16:59,  1.46it/s]
1607it [18:06,  1.48it/s]
1380it [15:37,  1.47it/s]


Finally we get the Hvorlangterder.dk data

In [14]:
############ Copenhagen ##############

geo_bolig1 = pd.read_csv("boligsiden_dingeo_1_K1.csv")
geo_bolig1['Location'] = geo_bolig1['address.street'].str.split(',').str[0] + ', ' \
+ geo_bolig1['address.postalId'].astype(str)
df_Boligsiden_Dingeo_Hvorlangterder1 = add_hvorlangterder(geo_bolig1)
df_Boligsiden_Dingeo_Hvorlangterder1.to_csv('bdh_1_K1.csv', index=False)


geo_bolig1 = pd.read_csv("boligsiden_dingeo_1_K2.csv")
geo_bolig1['Location'] = geo_bolig1['address.street'].str.split(',').str[0] + ', ' \
+ geo_bolig1['address.postalId'].astype(str)
df_Boligsiden_Dingeo_Hvorlangterder1 = add_hvorlangterder(geo_bolig1)
df_Boligsiden_Dingeo_Hvorlangterder1.to_csv('bdh_1_K2.csv', index=False)


geo_bolig1 = pd.read_csv("boligsiden_dingeo_1_K3.csv")
geo_bolig1['Location'] = geo_bolig1['address.street'].str.split(',').str[0] + ', ' \
+ geo_bolig1['address.postalId'].astype(str)
df_Boligsiden_Dingeo_Hvorlangterder1 = add_hvorlangterder(geo_bolig1)
df_Boligsiden_Dingeo_Hvorlangterder1.to_csv('bdh_1_K3.csv', index=False)


geo_bolig1 = pd.read_csv("boligsiden_dingeo_1_K4.csv")
geo_bolig1['Location'] = geo_bolig1['address.street'].str.split(',').str[0] + ', ' \
+ geo_bolig1['address.postalId'].astype(str)
df_Boligsiden_Dingeo_Hvorlangterder1 = add_hvorlangterder(geo_bolig1)
df_Boligsiden_Dingeo_Hvorlangterder1.to_csv('bdh_1_K4.csv', index=False)


geo_bolig1 = pd.read_csv("boligsiden_dingeo_1_K5.csv")
geo_bolig1['Location'] = geo_bolig1['address.street'].str.split(',').str[0] + ', ' \
+ geo_bolig1['address.postalId'].astype(str)
df_Boligsiden_Dingeo_Hvorlangterder1 = add_hvorlangterder(geo_bolig1)
df_Boligsiden_Dingeo_Hvorlangterder1.to_csv('bdh_1_K5.csv', index=False)

  exec(code_obj, self.user_global_ns, self.user_ns)
100%|█████████████████████████████████████| 9405/9405 [2:01:10<00:00,  1.29it/s]
  exec(code_obj, self.user_global_ns, self.user_ns)
100%|█████████████████████████████████████| 8151/8151 [1:54:07<00:00,  1.19it/s]
100%|█████████████████████████████████████| 7615/7615 [1:59:05<00:00,  1.07it/s]
100%|█████████████████████████████████████| 8775/8775 [2:34:34<00:00,  1.06s/it]
100%|█████████████████████████████████████| 7932/7932 [2:12:09<00:00,  1.00it/s]


In [15]:
############### Frederiksberg #################

geo_bolig1 = pd.read_csv("boligsiden_dingeo_1_F1.csv")
geo_bolig1['Location'] = geo_bolig1['address.street'].str.split(',').str[0] + ', ' \
+ geo_bolig1['address.postalId'].astype(str)
df_Boligsiden_Dingeo_Hvorlangterder1 = add_hvorlangterder(geo_bolig1)
df_Boligsiden_Dingeo_Hvorlangterder1.to_csv('bdh_1_F1.csv', index=False)


geo_bolig1 = pd.read_csv("boligsiden_dingeo_1_F2.csv")
geo_bolig1['Location'] = geo_bolig1['address.street'].str.split(',').str[0] + ', ' \
+ geo_bolig1['address.postalId'].astype(str)
df_Boligsiden_Dingeo_Hvorlangterder1 = add_hvorlangterder(geo_bolig1)
df_Boligsiden_Dingeo_Hvorlangterder1.to_csv('bdh_1_F2.csv', index=False)


geo_bolig1 = pd.read_csv("boligsiden_dingeo_1_F3.csv")
geo_bolig1['Location'] = geo_bolig1['address.street'].str.split(',').str[0] + ', ' \
+ geo_bolig1['address.postalId'].astype(str)
df_Boligsiden_Dingeo_Hvorlangterder1 = add_hvorlangterder(geo_bolig1)
df_Boligsiden_Dingeo_Hvorlangterder1.to_csv('bdh_1_F3.csv', index=False)


geo_bolig1 = pd.read_csv("boligsiden_dingeo_1_F4.csv")
geo_bolig1['Location'] = geo_bolig1['address.street'].str.split(',').str[0] + ', ' \
+ geo_bolig1['address.postalId'].astype(str)
df_Boligsiden_Dingeo_Hvorlangterder1 = add_hvorlangterder(geo_bolig1)
df_Boligsiden_Dingeo_Hvorlangterder1.to_csv('bdh_1_F4.csv', index=False)


geo_bolig1 = pd.read_csv("boligsiden_dingeo_1_F5.csv")
geo_bolig1['Location'] = geo_bolig1['address.street'].str.split(',').str[0] + ', ' \
+ geo_bolig1['address.postalId'].astype(str)
df_Boligsiden_Dingeo_Hvorlangterder1 = add_hvorlangterder(geo_bolig1)
df_Boligsiden_Dingeo_Hvorlangterder1.to_csv('bdh_1_F5.csv', index=False)

100%|███████████████████████████████████████| 1640/1640 [39:33<00:00,  1.45s/it]
100%|███████████████████████████████████████| 1449/1449 [19:46<00:00,  1.22it/s]
100%|███████████████████████████████████████| 1479/1479 [14:39<00:00,  1.68it/s]
100%|███████████████████████████████████████| 1602/1602 [14:18<00:00,  1.87it/s]
100%|███████████████████████████████████████| 1376/1376 [10:59<00:00,  2.09it/s]


Now we just need to piece everything together

In [16]:
bdh_1_K1 = pd.read_csv("bdh_1_K1.csv")
bdh_1_K2 = pd.read_csv("bdh_1_K2.csv")
bdh_1_K3 = pd.read_csv("bdh_1_K3.csv")
bdh_1_K4 = pd.read_csv("bdh_1_K4.csv")
bdh_1_K5 = pd.read_csv("bdh_1_K5.csv")

bdh_1_F1 = pd.read_csv("bdh_1_F1.csv")
bdh_1_F2 = pd.read_csv("bdh_1_F2.csv")
bdh_1_F3 = pd.read_csv("bdh_1_F3.csv")
bdh_1_F4 = pd.read_csv("bdh_1_F4.csv")
bdh_1_F5 = pd.read_csv("bdh_1_F5.csv")

raw_data_1 = pd.concat([bdh_1_K1, bdh_1_K2, bdh_1_K3, bdh_1_K4, bdh_1_K5,
                        bdh_1_F1, bdh_1_F2, bdh_1_F3, bdh_1_F4, bdh_1_F5], sort=False)

raw_data_1.to_csv('raw_data_1.csv', index=False)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
