In [164]:
import pandas as pd
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from contextlib import closing
from requests import get
import unicodedata
import re
import json
import logging

In [6]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
    
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [150]:
url = r'https://www.hemnet.se/salda/bostader?location_ids%5B%5D=17755&item_types%5B%5D=villa&item_types%5B%5D=radhus&item_types%5B%5D=bostadsratt&rooms_min=2.5&selling_price_min=2000000&selling_price_max=4000000&sold_age=all'

raw_html = simple_get(url = url)
html = BeautifulSoup(raw_html, 'html.parser')

In [90]:
def find_numbers(text):
    
    regexp = re.compile('(\d+)')
    
    results = regexp.findall(text)
    price_str = ''
    for result in results:
        price_str+=result
    return float(price_str)
    

In [181]:
def load_house_data(url):
    
    data = {}
    
    raw_html = simple_get(url = url)
    html_ = BeautifulSoup(raw_html, 'html.parser')
    
    item = html_.find(class_ = 'sold-property__price')
    text = item.find(class_ = 'sold-property__price-value').text
    text_ = unicodedata.normalize('NFKD',text)
    data['price'] = find_numbers(text = text_)
    
    item = html_.find(class_ = 'sold-property__details')
    attribute_names = []
    for attribute in item.find_all(class_ = 'sold-property__attribute'):
        attribute_names.append(attribute.text)

    attributes = {}

    for name,value_item in zip(attribute_names,item.find_all(class_ = 'sold-property__attribute-value')):
        value_str = unicodedata.normalize('NFKD',value_item.text)
        #value = find_numbers(text = value_str)
        attributes[name] = value_str
        
    data.update(attributes)
    
    map_item = html_.find(class_ = 'sold-property__map')
    map_data = json.loads(map_item['data-initial-data'])

    listing = map_data['listing']
    data['id'] = listing['id']
    data['coordinate'] = listing['coordinate']
    data['type'] = listing['type']
    data['address'] = listing['address']
    data['map_url'] = map_data['map_url']
    data['url'] = listing['url']
    data['sale_date'] = listing['sale_date']
    
    data = pd.Series(data)
    data.name = data['id']
    
    return data

In [182]:
house_data = pd.DataFrame()

part1 = r'https://www.hemnet.se/salda/bostader?item_types%5B%5D=villa&item_types%5B%5D=radhus&item_types%5B%5D=bostadsratt&location_ids%5B%5D=17755&page='
part2 = '&rooms_min=2.5&selling_price_max=4000000&selling_price_min=2000000&sold_age=all'
    
ok = True
i = 0
while ok:
    i+=1
    try:
        url = part1 + '%i' % i + part2
        raw_html = simple_get(url = url)
        html = BeautifulSoup(raw_html, 'html.parser')
        
        for item_link_contaier in html.find_all(class_="item-link-container"):
            try:
                data = load_house_data(url = item_link_contaier['href'])
            except:
                logging.exception('Skipping house')
            else:
                house_data = house_data.append(data)
    except:
        ok = False
        logging.exception('Could not find page:%i' % i)

ERROR:root:Skipping house
Traceback (most recent call last):
  File "<ipython-input-182-0ac8724e2624>", line 17, in <module>
    data = load_house_data(url = item_link_contaier['href'])
  File "<ipython-input-181-70ad755be9af>", line 28, in load_house_data
    map_data = json.loads(map_item['data-initial-data'])
TypeError: 'NoneType' object is not subscriptable
ERROR:root:Skipping house
Traceback (most recent call last):
  File "<ipython-input-182-0ac8724e2624>", line 17, in <module>
    data = load_house_data(url = item_link_contaier['href'])
  File "<ipython-input-181-70ad755be9af>", line 28, in load_house_data
    map_data = json.loads(map_item['data-initial-data'])
TypeError: 'NoneType' object is not subscriptable
ERROR:root:Skipping house
Traceback (most recent call last):
  File "<ipython-input-182-0ac8724e2624>", line 17, in <module>
    data = load_house_data(url = item_link_contaier['href'])
  File "<ipython-input-181-70ad755be9af>", line 28, in load_house_data
    map_data = 

In [183]:
house_data.describe()

Unnamed: 0,id,price
count,23491.0,23491.0
mean,528805.952918,2878921.0
std,259546.746889,568206.2
min,313.0,2000000.0
25%,308655.0,2400000.0
50%,548024.0,2800000.0
75%,757900.5,3350000.0
max,939070.0,4000000.0


In [184]:
house_data.to_csv('house_data.csv')

In [185]:
house_data

Unnamed: 0,Antal rum,Avgift/månad,Begärt pris,Boarea,Byggår,Driftskostnad,Förening,Pris per kvadratmeter,Prisutveckling,address,coordinate,id,map_url,price,sale_date,type,url,Biarea,Tomtarea
938778,4 rum,5 604 kr/mån,2 475 000 kr,103 m2,2008,9 600 kr/år,\n Brf Höjdpunkten\n,24 757 kr/m2,\n\n +75 000 kr (+3 %)\n,Björkhagegatan 6,"[57.92291326887233, 12.502651264129536]",938778.0,https://maps.googleapis.com/maps/api/js?librar...,2550000.0,Såld 2019-01-28,bostadsratt,/salda/bostadsratt-4rum-stadsskogen-alingsas-k...,,
938744,7 rum,,1 995 000 kr,175 m2,1958,52 400 kr/år,,13 029 kr/m2,\n\n +285 000 kr (+14 %)\n,Ekebackevägen 1,"[58.47241420896091, 11.669846459571746]",938744.0,https://maps.googleapis.com/maps/api/js?librar...,2280000.0,Såld 2019-01-28,villa,/salda/villa-7rum-lilla-foss-munkedals-kommun-...,145 m2,1 483 m2
938717,5 rum,,3 495 000 kr,103 m2,1979,38 865 kr/år,,32 330 kr/m2,\n\n -165 000 kr (-5 %)\n,Agnebäcksvägen 24,"[57.69013704556706, 12.208904027993107]",938717.0,https://maps.googleapis.com/maps/api/js?librar...,3330000.0,Såld 2019-01-28,radhus,/salda/radhus-5rum-landvetter-harryda-kommun-a...,,201 m2
938962,5 rum,,2 275 000 kr,137 m2,1980,36 000 kr/år,,16 606 kr/m2,\n,Tvåöringsgränd 12,"[58.377188776295206, 11.952471921867492]",938962.0,https://maps.googleapis.com/maps/api/js?librar...,2275000.0,Såld 2019-01-28,villa,/salda/villa-5rum-helenedal-uddevalla-kommun-t...,21 m2,346 m2
938923,3 rum,4 492 kr/mån,2 650 000 kr,70 m2,2016,,\n BRF Lergöken\n,40 000 kr/m2,\n\n +150 000 kr (+6 %)\n,Lergöksgatan 15,"[57.65738875448929, 11.9003493296695]",938923.0,https://maps.googleapis.com/maps/api/js?librar...,2800000.0,Såld 2019-01-28,bostadsratt,/salda/bostadsratt-3rum-frolunda-goteborgs-kom...,,
938868,5 rum,5 640 kr/mån,2 995 000 kr,92 m2,1967,5 400 kr/år,\n Brf Smörhålan i Mölndal\n,32 880 kr/m2,\n\n +30 000 kr (+1 %)\n,Irisgatan 4A,"[57.650282811001254, 12.015895880169424]",938868.0,https://maps.googleapis.com/maps/api/js?librar...,3025000.0,Såld 2019-01-28,bostadsratt,/salda/bostadsratt-5rum-molndal-broslatt-molnd...,,
938867,3 rum,3 605 kr/mån,3 650 000 kr,81 m2,2012,6 600 kr/år,\n Brf Österport Vänortsgatan\n ...,47 840 kr/m2,\n\n +225 000 kr (+6 %)\n,Vänortsgatan 30,"[57.66513209073759, 12.010633588056578]",938867.0,https://maps.googleapis.com/maps/api/js?librar...,3875000.0,Såld 2019-01-28,bostadsratt,/salda/bostadsratt-3rum-molndal-bosgarden-moln...,,
938684,3 rum,3 843 kr/mån,2 250 000 kr,66 m2,1954,,,35 606 kr/m2,\n\n +100 000 kr (+4 %)\n,Saxofongatan 14,"[57.66020270329029, 11.919126090072947]",938684.0,https://maps.googleapis.com/maps/api/js?librar...,2350000.0,Såld 2019-01-28,bostadsratt,/salda/bostadsratt-3rum-ruddalen-goteborgs-kom...,,
938587,5 rum,,3 100 000 kr,122 m2,1997,26 577 kr/år,,29 508 kr/m2,\n\n +500 000 kr (+16 %)\n,Hökvägen 23,"[58.36369384831502, 13.816872969524614]",938587.0,https://maps.googleapis.com/maps/api/js?librar...,3600000.0,Såld 2019-01-28,villa,/salda/villa-5rum-hentorp-skovde-kommun-hokvag...,10 m2,516 m2
938584,4 rum,7 949 kr/mån,3 750 000 kr,"93,5 m2",1986,4 200 kr/år,\n Tornhuset\n,37 433 kr/m2,\n\n -250 000 kr (-7 %)\n,"Tritongatan 4, 55 +","[57.7006060966225, 11.992784890508391]",938584.0,https://maps.googleapis.com/maps/api/js?librar...,3500000.0,Såld 2019-01-28,bostadsratt,/salda/bostadsratt-4rum-garda-goteborgs-kommun...,,
