Welcome!

In [1]:
from bs4 import BeautifulSoup, SoupStrainer
import urllib
import re
import pandas as pd
from collections import defaultdict
import numpy as np
from selenium.webdriver.common.keys import Keys

# Try 2 - 
# https://stackoverflow.com/questions/29404856/how-can-i-render-javascript-html-to-html-in-python 
# https://stackoverflow.com/questions/29858752/error-message-chromedriver-executable-needs-to-be-available-in-the-path

from selenium import webdriver
from urllib.request import urlopen
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager

In [2]:
def read_adress_data(pickle_path='data/adresses/copenhagen_adresses.pickle', raw_path='http://dawa.aws.dk/adresser?format=csv&kommunekode=0101'):
    try: 
        print('Loading from pickle')
        df_adress = pd.read_pickle(pickle_path)
    except:
        print(f'Loading from pickle failed. Loading from source ({raw_path}) instead')
        address_path = raw_path     #address_path = 'http://dawa.aws.dk/adresser?format=json&kommunekode=0101'
        with urllib.request.urlopen(address_path) as f:
            data_binary = f.readlines() #.decode()
        data = [re.sub(r'(?!(([^"]*"){2})*[^"]*$),', '', x.decode().strip()).split(',') for x in data_binary] # https://stackoverflow.com/questions/38336518/remove-all-commas-between-quotes
        df_adress = pd.DataFrame(data[1:], columns=data[0])
        df.to_pickle(pickle_path)
    return df_adress

In [20]:
#df_adress = read_adress_data()

### Fetch boligsalgsdata!

In [54]:
def get_bolig_list(municipality_code='101', pages=3):
    
    bolig_liste = defaultdict(str)

    for page in range(1, pages+1):
        url_in_scope = f'https://www.boliga.dk/resultat?sort=zipCode-a&municipality={municipality_code}&page={page}'
        print('Scraping', url_in_scope)
        try:
            page = urllib.request.urlopen(url_in_scope)
            #soup = BeautifulSoup(page, 'html.parser')
            #print(soup)

            for link in BeautifulSoup(page, 'html.parser', parse_only=SoupStrainer('a')):

                if link.has_attr('href'):
                    #print(link['href'])
                    if re.match('/bolig/\d*/.*', link['href']):
                        bolig_liste[link['href']] = defaultdict(str)


        except Exception as e:
                print("An error occured.", e)
    
    return bolig_liste

def get_html_with_js(url, quit_browser=True, close_tabs=False, browser=None):
    """
    Description:
        Get html based on an url with javascript data compiled
    Input:
        url (str): Url to load and js compiled
    Returns:
        str: html including compiled js content
    """
    file_name = 'C:/working/ngs-lib/Boligsalg/data/url_data/data.txt'
    
    # Store HTML locally 
    conn = urlopen(url)
    data = conn.read()
    conn.close()
    file = open(file_name,'wb')
    file.write(data)
    file.close()

    # Compile javascript to get complete complete html
    if browser is None:
        browser = webdriver.Chrome(ChromeDriverManager().install()) #browser = webdriver.Firefox(executable_path=GeckoDriverManager().install())
    else:
        pass
    browser.get('file:///'+file_name) #browser.get(url)
    html = browser.page_source
    if quit_browser:
        browser.quit()
    if close_tabs:
        browser.close() # close_tabs()
    
    return html 

def initialize_lookout_dict():
    """
    Description: 
        Set's up the definitions to look for in a html file. 

        * start_key: pattern t look for, for start of key ordered 
        * start_key: pattern to look for, for start of key ordered.
        * start_value: pattern to look for, for start of value ordered.
        * end_value: pattern to look for, for end of value ordered.
        * name: Predefined name. If false then start_key/end_key needs to be specified
        * list: False results expected to be only on for each key. True if multiple values for each key.
        * limit: limit of iterations to look for the start_value/start_key/end_value/end_key patterns. 
    Returns:
        dict: Dictionary desscribing which patterns to look for 
    
    """
    lookout_dict = {}
    lookout_dict['Adresse'] = {'start_value': ['<meta name="keywords" content="">', '<title>'],
                              'end_value': ['</title>', ' - '],
                              'name': 'Adresse',
                              'list': False,
                              'limit': 1}

    lookout_dict['Basisoplysninger'] = {'start_key': ['<span _ngcontent-sc','="" class="mb-1 d-md-none font-weight-bold w-100">'],
                                  'end_key': ['</span>'],
                                  'start_value': ['<span _ngcontent-sc','="" class="d-md-none my-auto">'],
                                  'end_value': ['</span>'],
                                  'name': None, # missing, so will need to find it in data!
                                  'list': False,
                                  'limit': 9}

    lookout_dict['Pris'] = {'start_value': ['<span class="h-md-2 h4 font-weight-bold m-0 text-nowrap">'],
                                  'end_value': ['</span>'],
                                  'name': 'Pris', # missing, so will need to find it in data!
                                  'list': False,
                                  'limit': 1}

    lookout_dict['Timeline'] = {'start_value': ['class="timeline">', '<span _ngcontent-sc','="" class="d-sm-none d-table-cell timeline">'],
                                  'end_value': ['</span>'],
                                  'name': 'Timeline_time', # missing, so will need to find it in data!
                                  'list': True,
                                  'limit': 10}

    lookout_dict['Timeline_action'] = {'start_value': ['class="timeline">', '<div _ngcontent-sc','="" class="mr-0 mr-sm-2"><!---->'],
                                  'end_value': ['<!---->'],
                                  'name': 'Timeline_action', # missing, so will need to find it in data!
                                  'list': True,
                                  'limit': 10}

    lookout_dict['Timeline_price'] = {'start_value': ['class="timeline">', 'd-sm-none d-table-cell month', '<span _ngcontent-sc','="">'],
                                  'end_value': ['<'],
                                  'name': 'Timeline_price', # missing, so will need to find it in data!
                                  'list': True,
                                  'limit': 10}

    lookout_dict['Boligtype'] = { 'start_value': ['<app-tooltip _ngcontent-sc','=""',' class="md-right flex-shrink-0" _nghost-sc','=""><!---->','<p _ngcontent-sc','="" class="app-tooltip ng-star-inserted"><!----><!---->'],
                                  'end_value': ['<!---->'],
                                  'name': 'Boligtype', # missing, so will need to find it in data!
                                  'list': False,
                                  'limit': 1}

    lookout_dict['Oprettet'] = { 'start_value': ['BBR', 'BBR-arealet opmåles fra ydersiden af hver ydervæg og indeholder arealer som fx opgang.', '<app-tooltip class="tooltip-nowrap" _nghost-sc','="">', '<!----><p _ngcontent-sc','=""', ' class="app-tooltip ng-star-inserted"><!----><!---->', '<p _ngcontent-sc','="" class="app-tooltip ng-star-inserted"><!----><!---->'],
                                  'end_value': ['<!---->'],
                                  'name': 'Oprettet', # missing, so will need to find it in data!
                                  'list': False,
                                  'limit': 1}

    lookout_dict['Markedstid'] = {'start_value': ['<use xlink:href="#icon-calendar"></use></svg><span class="text-primary h5 h-md-4 m-0">'],
                                  'end_value': ['</span>'],
                                  'name': 'Markedstid', # missing, so will need to find it in data!
                                  'list': False,
                                  'limit': 1}

    lookout_dict['Link'] = {'start_value': ['<a class="btn btn-primary col-12 w-100 font-weight-bolder px-1 ng-star-inserted" data-gtm="see_property_at_agency_btn" target="_blank" href="'],
                                  'end_value': ['"'],
                                  'name': 'Link', # missing, so will need to find it in data!
                                  'list': False,
                                  'limit': 1}

    
    lookout_dict['Link_alt'] = {'start_value': ['btn btn-primary col-12 w-100 font-weight-bolder px-1 ng-star-inserted', 'href="'],
                                  'end_value': ['"'],
                                  'name': 'Link_alt', # missing, so will need to find it in data!
                                  'list': False,
                                  'limit': 1}

    
    #
    
    # If Failed on basisoplysninger, we can try to add them manually: 
    elements = ['Boligstørrelse', 'Grundstørrelse', 'Byggeår', 
                    'Energimærke', 'Ejerudgift', 'Kælderstørrelse',
                    'Boligareal, tinglyst']
    for element in elements:
        lookout_dict[element + '_alt'] = {'start_value': [element, '<span _ngcontent-sc41="" class="d-md-none my-auto">'],
                                      'end_value': ['<!---->'],
                                      'name': element + '_alt', # missing, so will need to find it in data!
                                      'list': False,
                                      'limit': 1} 
        
    for element in elements:
        lookout_dict[element + '_alt2'] = {'start_value': [element, '<span _ngcontent-sc41="" class="d-md-none my-auto">'],
                                      'end_value': ['</span>'],
                                      'name': element + '_alt', # missing, so will need to find it in data!
                                      'list': False,
                                      'limit': 1} 
    
    # 4 værelse, 5 sal, link, salgshistorik
#     lookout_dict['Boligstørrelse_alt'] = {'start_value': ['Grundstørrelse:'],
#                                   'end_value': ['<!---->'],
#                                   'name': 'Grundstørrelse_alt', # missing, so will need to find it in data!
#                                   'list': False,
#                                   'limit': 1}


    return lookout_dict

def get_start_end(lookout_starts, lookout_ends, content, start=0, end=10e6):
    """
    Description:
        Get start and end of the lookout patterns.
        Used as a helper function to "fetch data"
        
        Iteratively looks over the "lookout_starts" to determine the final starting position. 
        I.e. if lookout_starts has one than one element, it will start by finding the first 
        element, then it will continue to find the second element the first time it appears 
        after the first one, and so forth with the third element the first time it appears 
        after the second. Sample principle holds for the "lookout_ends"
    Input:  
        lookout_starts (list): List of element to finde iteratively. 
        lookout_ends (list): List of element to find iteratively (startin)
        content (str): Content to be searched
    Returns:
        int: start. Starting position based on lookout_starts
        int: end. Ending position based on lookout_ends
    """
    
    for lookout_start in lookout_starts:
        start = content.find(lookout_start, start+1)
    for e, lookout_end in enumerate(lookout_ends):
        if e == 0:
            end = content.find(lookout_end, start+len(lookout_starts[-1]))
        else: 
            end = content.rfind(lookout_end, start+len(lookout_starts[-1]), end)

    return start, end 


def fetch_data(soup, lookout_dict, threshold_key_len=200):
    """
    Fetches data based on lookout dict from soup
    
    Input: 
        soup (bs4 soup):
        lookout_dict (dict): As defined and described in function "initialize_lookout_dict".
        threshold_key_len (int): To make sure gibberish are not stored!
    Returns:
        dict: Dictionary containing ordered information from the soup based on the lookout patterns in lookout_dict
    """
    bolig_dict = {}
    bolig_dict_temp = {}
    
    contents = [str(x) for x in soup.find('pre')]
    for content in contents: 
        host_identifier = '[_nghost-'
        host_start = content.find(host_identifier)
        host_end = content.find(']', host_start)
        #sc_host = content[host_start+len(host_identifier):host_end]
        
        #sc_ids = [int(re.search('\d\d*', y).group()) for y in re.findall('sc\d\d*', content)]
        #sc_ids = list(set(sc_ids))
        #sc_min, sc_max = min(sc_ids), max(sc_ids)
        #print('sc min max', sc_min, sc_max)
        #print('number of sc', len(sc_ids), sc_ids)
        #for sc_id, sc_host_int in enumerate(sc_ids): #range(sc_min,sc_max):
        #    sc_host = str(sc_host_int)
        #print('this is the host:', sc_host, type)
        sc_host = 'doesnotmatteranymore'
        for k in lookout_dict:
            #print('Lookout patterns', k,  lookout_dict[k])
            start = 0
            end = len(content)
            counter = 0
            while True: # I

                if 'name' in lookout_dict[k]: 
                    key_in_scope = lookout_dict[k]['name']

                if ('start_key' and 'end_key') in lookout_dict[k]: 
                    lookout_starts = [re.sub('sc\d\d*', 'sc'+sc_host, x) for x in lookout_dict[k]['start_key']] #['<meta name="keywords" content="">', '<title>']
                    lookout_ends = [re.sub('sc\d\d*', 'sc'+sc_host, x) for x in  lookout_dict[k]['end_key']]
                    start, end = get_start_end(lookout_starts, lookout_ends, content, start=start, end=end)
                    key_in_scope = content[start+len(lookout_starts[-1]):end].strip() # finding key from context

                if ('start_value' and 'end_value') in lookout_dict[k]: 
                    lookout_starts = [re.sub('sc\d\d*', 'sc'+sc_host, x) for x in lookout_dict[k]['start_value']] #['<meta name="keywords" content="">', '<title>']
                    lookout_ends = [re.sub('sc\d\d*', 'sc'+sc_host, x) for x in lookout_dict[k]['end_value']]
                    start, end = get_start_end(lookout_starts, lookout_ends, content, start=start, end=end)

                counter += 1
                if start != -1:
                    #print('start', start, end)
                    if lookout_dict[k]['list']:
                        if key_in_scope in bolig_dict_temp:
                            bolig_dict_temp[key_in_scope].append(content[start+len(lookout_starts[-1]):end].strip())
                        else:
                            bolig_dict_temp[key_in_scope] = [content[start+len(lookout_starts[-1]):end].strip()]
                    else:
                        id_counter = 2
                        if key_in_scope in bolig_dict_temp:
                            bolig_dict_temp[key_in_scope + str(id_counter)] = content[start+len(lookout_starts[-1]):end].strip()
                        else:
                            bolig_dict_temp[key_in_scope] = content[start+len(lookout_starts[-1]):end].strip()
                        id_counter += 1
                else:
                    break
                if counter >= lookout_dict[k]['limit']:
                    break

        # cleanup
        for k in bolig_dict_temp: 
            if len(k) < threshold_key_len:
                bolig_dict[k] = bolig_dict_temp[k]
        #bolig_dict_temp
        return bolig_dict


In [55]:
def main():
    bolig_data = []
    #bolig_liste = get_bolig_list(municipality_code='101', pages=1)
    #bolig_urls = [f'https://www.boliga.dk{bolig}' for bolig in bolig_liste]
    
    bolig_urls = ['https://www.boliga.dk/bolig/1605331/tordenskjoldsgade_27_5_th_1055_koebenhavn_k',
                  'https://www.boliga.dk/bolig/1597985/ingrid_marievej_50__4_tv_2500_valby',
                  'https://www.boliga.dk/bolig/1603678/kongens_nytorv_21_3_tv_1050_koebenhavn_k']
    print(bolig_urls[0:3])
    #bolig_url = bolig_urls[1] # 
    for bolig_url in bolig_urls[0:3]:
        #bolig_url = 'https://www.boliga.dk/bolig/1644897/ingrid_marievej_66__12_4_2500_valby'
        browser = webdriver.Chrome(ChromeDriverManager().install())
        html = get_html_with_js(bolig_url, quit_browser=True, close_tabs=False, browser=browser)
        soup = BeautifulSoup(html, "html.parser")
        lookout_dict = initialize_lookout_dict()
        bolig_d = fetch_data(soup, lookout_dict, threshold_key_len=200)
        bolig_data.append(bolig_d)
    
    return bolig_data
        
if __name__ == '__main__':
    bolig_data = main()
    

['https://www.boliga.dk/bolig/1605331/tordenskjoldsgade_27_5_th_1055_koebenhavn_k', 'https://www.boliga.dk/bolig/1597985/ingrid_marievej_50__4_tv_2500_valby', 'https://www.boliga.dk/bolig/1603678/kongens_nytorv_21_3_tv_1050_koebenhavn_k']

Looking for [chromedriver 80.0.3987.106 win32] driver in cache 
File found in cache by path [C:\Users\theone\.wdm\drivers\chromedriver\80.0.3987.106\win32\chromedriver.exe]

Looking for [chromedriver 80.0.3987.106 win32] driver in cache 
File found in cache by path [C:\Users\theone\.wdm\drivers\chromedriver\80.0.3987.106\win32\chromedriver.exe]

Looking for [chromedriver 80.0.3987.106 win32] driver in cache 
File found in cache by path [C:\Users\theone\.wdm\drivers\chromedriver\80.0.3987.106\win32\chromedriver.exe]


In [56]:
for data in bolig_data:
    for k in data:
        if isinstance(data[k], list):
            print(k)
            for l in data[k]:
                print(' '*4, l)
        else:
            print(k, data[k])
    print('')
    print('-'*50)
    print('')


Adresse Tordenskjoldsgade 27, 5. th., 1055 København K
Boligstørrelse 185 m²
Grundstørrelse 0 m²
Værelser 4
Etage 5. sal
Byggeår 1880
Energimærke D
Ejerudgift 6.454&nbsp; kr. / md.
Kælderstørrelse 0 m²
Boligareal, tinglyst 171 m²
Pris 10.995.000&nbsp;kr.
Timeline_time
     feb. 2020
     okt. 2019
     jun. 2009
     apr. 2008
     nov. 2005
     sep. 2004
     maj 2004
     apr. 2004
     feb. 2020
     okt. 2019
Timeline_action
     Prisfald:  -355.000&nbsp;kr.
     Boligen blev sat til salg
     Solgt, alm. frit salg
     <a _ngcontent-sc23="" href="/bolig/229440/tordenskjoldsgade_27_5_th_1055_koebenhavn_k" class="ng-star-inserted"> Boligen blev sat til salg
     Solgt, alm. frit salg
     Solgt, alm. frit salg
     Solgt, ukendt
     Solgt, ukendt
     Prisfald:  -355.000&nbsp;kr.
     Boligen blev sat til salg
Timeline_price
     10.995.000&nbsp;kr.
     11.350.000&nbsp;kr.
     5.000.000&nbsp;kr.
     5.689.000&nbsp;kr.
     5.100.000&nbsp;kr.
     4.400.000&nbsp;kr.
     0&nbsp;

In [240]:
# look_for_list = ['Marievej',
#                      '4.672',
#                      '4.945.000',
#                      #'250.000',
#                      #'43.377',
#                      'mar. 2020',
#                      'Boligen blev sat til salg',
#                      'A20',
#                      '114',
#                      '2500 Valby',
#                      '6 dage på markedet',
#                      'Se bolig hos mægler',
#                      'danbolig',
#                      '4 værelser']

# type(html)
# soup = BeautifulSoup(html, "html.parser")
# tags = {tag.name: 1 for tag in soup.find_all()}

# for t in list(tags)[3:]:
#     print('tag', t)
#     tag_contents = [str(tag_v) for tag_v in soup.find(t)]
#     for j, tag_v in enumerate(tag_contents):
#         print(j, len(tag_v))
        
#         for lookout in look_for_list:
#             if lookout in tag_v:
#                 print(' '*5, 'Interesting, found', lookout, 'in', t)

Manual code to determine where the relevant stuff is!

In [41]:
bolig_url = 'https://www.boliga.dk/bolig/1605331/tordenskjoldsgade_27_5_th_1055_koebenhavn_k'
#bolig_url = 'https://www.boliga.dk/bolig/1597985/ingrid_marievej_50__4_tv_2500_valby'
#bolig_url = 'https://www.boliga.dk/bolig/1603678/kongens_nytorv_21_3_tv_1050_koebenhavn_k'

browser = webdriver.Chrome(ChromeDriverManager().install())
html = get_html_with_js(bolig_url, quit_browser=True, close_tabs=False, browser=browser)
soup = BeautifulSoup(html, "html.parser")


Looking for [chromedriver 80.0.3987.106 win32] driver in cache 
File found in cache by path [C:\Users\theone\.wdm\drivers\chromedriver\80.0.3987.106\win32\chromedriver.exe]


In [26]:
#look_for_list = ['185', '4 værelser', '5. sal', '171', 'okt. 2019', 'Boligen blev sat til salg', '11.350.000']
look_for_list = ['97' ,'Boligstørrelse', '3 værelser', '4. sal', '2019', 'A20', '4.243  kr. / md.', 'sep. 2019', 'Boligen blev sat til salg']


In [50]:
i = 1
print('in scope', look_for_list[i])

list_in_scope = look_for_list[i:i+1]
list_in_scope = ['price-movement-col']

#lookout = look_for_list[0]
contents = [str(x) for x in soup.find('pre')]
for content in contents: 
    for lookout in list_in_scope:
        start = 0
        end = np.inf
        counter = 0
        while True:
            counter += 1
            start = content.find(lookout, start)
            #place = content.find(lookout, start)
            print(lookout, start) 
            print(content[start-100:start+850])
            print('')
            if start == -1:
                break
            if counter > 10:
                break
            start = start+1
#     if lookout in content:
#         print('fun') # <span _ngcontent-sc79="" class="d-md-none my-auto">

in scope Boligstørrelse
price-movement-col 43890
#5A626B;
}
.timeline[_ngcontent-sc23] {
  color: #5A626B;
  height: 24px;
  white-space: nowrap;
}
.price-movement-col[_ngcontent-sc23]   div[_ngcontent-sc23] {
  text-align: center;
}
.price-movement-col[_ngcontent-sc23]   div[_ngcontent-sc23]   svg[_ngcontent-sc23] {
  margin-left: auto;
  margin-right: auto;
}
.badge-sold[_ngcontent-sc23] {
  font-size: 12px;
  line-height: 15px;
  padding: 5px 9px;
}
.clear-btn[_ngcontent-sc23] {
  padding: 0px;
  border: 0px;
  color: #5A626B;
  font-weight: 500;
}</style><style ng-transition="boliga-app">@charset "UTF-8";
.card[_ngcontent-sc24] {
  border: 1px solid rgba(0, 0, 0, 0.03);
  box-shadow: 0 7px 10px rgba(0, 0, 0, 0.05);
  border-radius: 6px;
  background: #ffffff;
}
.card[_ngcontent-sc24]   .card-header[_ngcontent-sc24], .card[_ngcontent-sc24]   .card-footer[_ngcontent-sc24] {
  padding-top: 19px;
  padding-bottom: 19px;
  background: #ffffff;
}
@media (max-width: 767.98px) {
  .card[_


In [104]:
# tag = soup.find('pre')
# sub_soup = BeautifulSoup(str(tag), "html.parser")
# sub_tags = {tag.name: 1 for tag in sub_soup.find_all()}
# sub_tags
# #type(html), type(soup), type(str(tag)), sub_soup

In [105]:
#str(sub_soup)

In [106]:
#subsub_soup = BeautifulSoup(sub_soup, "html.parser")

In [6]:
# for i, bolig in enumerate(bolig_liste.keys()):
#     if i == 0:
#         bolig_url = f'https://www.boliga.dk{bolig}'
#         print(bolig_url)
        
#         try:
#             page = urllib.request.urlopen(url_in_scope)
#             soup = BeautifulSoup(page, 'html.parser')
#             #print(soup)
#             #body = soup.find('body')
#             #body_no_tags = body.findChildren(recursive=False)
#             #print(body_no_tags)
#             tags = {tag.name: 1 for tag in soup.find_all()}
# #            [str(tag) for tag in soup.find_all()]
#         except Exception as e:
#             print("An error occured.", e)

https://www.boliga.dk/bolig/1616756/tyboroen_alle_91__st_2720_vanloese


In [7]:
# [str(tag_v) for tag_v in soup.find('html')][2]

# look_for_list = ['Marievej', '4.940', '4.945.000', '33.291', 'mar. 2020', 'Boligen blev sat til salg', 
#                  '120', '2500 Valby', '2500 Valby','7 dage på markedet', 'Se bolig hos mægler', 'danbolig']

# for t in list(tags):
#     #print('tag', t)
#     tag_values = [str(tag_v) for tag_v in soup.find(t)]
#     for j, tag_v in enumerate(tag_values):
#         #print(j)
#         #if j < 2:
#         for lookout in look_for_list:
#             if lookout in tag_v:
#                 print(' '*5, 'Interesting, found', lookout, 'in', t)
# #         if tag_v in
# #             print(' '*5, j, tag_v)
    

      Interesting, found Marievej in html
      Interesting, found 33.291 in html
      Interesting, found mar. 2020 in html
      Interesting, found Boligen blev sat til salg in html
      Interesting, found 120 in html
      Interesting, found 2500 Valby in html
      Interesting, found 2500 Valby in html
      Interesting, found Se bolig hos mægler in html
      Interesting, found Marievej in body
      Interesting, found 33.291 in body
      Interesting, found mar. 2020 in body
      Interesting, found 120 in body
      Interesting, found 2500 Valby in body
      Interesting, found 2500 Valby in body
      Interesting, found Marievej in body
      Interesting, found Boligen blev sat til salg in body
      Interesting, found 120 in body
      Interesting, found Se bolig hos mægler in body
      Interesting, found Marievej in app-root
      Interesting, found 33.291 in app-root
      Interesting, found mar. 2020 in app-root
      Interesting, found 120 in app-root
      Interesting, 

Some information seems to be missing. Maybe it's because some javascript is not yet calculated.


Looking for [chromedriver 80.0.3987.106 win32] driver in cache 
File found in cache by path [C:\Users\theone\.wdm\drivers\chromedriver\80.0.3987.106\win32\chromedriver.exe]


In [52]:


for interest in look_for_list:
    #print(interest)
    if interest in html:
        print(interest, 'is apparant')
    else:
        print(interest, 'not found :()')

#html

Marievej is apparant
4.672 is apparant
4.945.000 is apparant
250.000 is apparant
43.377 is apparant
mar. 2020 is apparant
Boligen blev sat til salg is apparant
A20 is apparant
114 is apparant
2500 Valby is apparant
6 dage på markedet is apparant
Se bolig hos mægler is apparant
danbolig is apparant


In [None]:
# TRY 1: Fialing.. ..

#soup = BeautifulSoup(page, 'html.parser')
#print(soup)



#https://stackoverflow.com/questions/8049520/web-scraping-javascript-page-with-python
from requests_html import HTMLSession, AsyncHTMLSession

# session = HTMLSession()
# r = session.get(bolig_url)
# r.html.render()


import asyncio
if asyncio.get_event_loop().is_running(): # Only patch if needed (i.e. running in Notebook, Spyder, etc)
    import nest_asyncio
    nest_asyncio.apply()


session = AsyncHTMLSession()
r = await session.get(bolig_url)
#await r.html.arender()
#resp=r.html.raw_html
#print(resp)