<h1>Import frameworks</h1>

In [188]:
import numpy as np
import pandas as pd
import glob
import errno
import re
import locale
import datetime
from matplotlib import pyplot
from bs4 import BeautifulSoup
# Switch LC_TIME to Swedish so that strptime's '%B' (used by date_obj) matches Swedish month names.
locale.setlocale(locale.LC_TIME, "sv_SE")

'sv_SE'

<h1>Cleaning methods</h1>

In [191]:
def date_obj(dateText):
    """Convert a listing's sold-date label into a ``datetime.datetime``.

    Every occurrence of the literal 'Såld ' is removed and the surrounding
    whitespace is stripped; the remainder is parsed as
    '<day> <full month name> <year>'. ``strptime`` matches the month name
    against the active LC_TIME locale.

    Raises:
        ValueError: if the cleaned text does not match '%d %B %Y'.
    """
    withoutLabel = dateText.replace('Såld ', '')
    return datetime.datetime.strptime(withoutLabel.strip(), '%d %B %Y')

def cleanLocation(locationText):
    """Return the location of a listing as a tidy 'part, part' string.

    The first nested <span> of ``locationText`` is removed in place (so the
    passed-in element is mutated), line breaks are dropped from the remaining
    text, and every comma-separated part is trimmed before the parts are
    re-joined with ', '.

    Args:
        locationText: element exposing ``.span`` and ``.text``; parseObject
            passes the parent tag of the location icon span.

    Returns:
        str: the normalised location text.
    """
    # Must happen before .text is read, otherwise the span's text leaks in.
    locationText.span.decompose()
    flattened = locationText.text.strip().replace("\n", "")
    return ", ".join(part.strip() for part in flattened.split(','))

def areaAndRoom(areaText):
    areaText.span.decompose() if areaText.span else areaText
    areaAndRoom = re.findall(r'\d+', areaText.text.strip())   
    areaAndRoomList = list(map(lambda x: x.strip(), areaAndRoom))
    intList = [eval(i) for i in areaAndRoomList]
    area = 0
    room = 0
    errors = 0
    try:
        area = intList[0]
        room = intList[1]
    except IndexError:
        errors += 1
    #print('Errors ' + errors.__str__())
    return area, room

def cleanLandArea(landAreaText):
    """Return the land area found in ``landAreaText`` as an int (0 if none).

    Non-breaking spaces (U+00A0) are removed first so that a number written
    with them as digit-group separators forms a single run of digits for
    zeroIfNoNumber.
    """
    withoutNbsp = landAreaText.replace('\u00a0', '')
    return zeroIfNoNumber(withoutNbsp)

def cleanPrice(priceText):
    """Return the price found in ``priceText`` as an int (0 if none).

    The label 'Slutpris', the currency suffix 'kr' and non-breaking spaces
    (U+00A0) are removed, in that order, before the first run of digits is
    extracted by zeroIfNoNumber.
    """
    for noise in ('Slutpris', 'kr', '\u00a0'):
        priceText = priceText.replace(noise, '')
    return zeroIfNoNumber(priceText)

def zeroIfNoNumber(valueText):
    """Return the first run of digits in ``valueText`` as an int.

    Args:
        valueText: text to scan.

    Returns:
        int: the first digit run converted with ``int()``, or 0 when the
        text contains no digits.
    """
    firstDigits = re.search(r'\d+', valueText)
    return int(firstDigits.group()) if firstDigits else 0

<h1>Parse Entity</h1>

In [192]:
def parseObject(obj):
    """Turn one sold-listing element into a flat row of cleaned values.

    Args:
        obj: BeautifulSoup element of a single listing (the caller passes
            each 'sold-results__normal-hit' <li>).

    Returns:
        list: [sold date, address, location, area, extra area, rooms,
        land area, price], cleaned by the helper functions of this notebook.

    Raises:
        AttributeError: if a mandatory element (date, address, location icon
            or price) is missing. An absent area element fails the same way
            inside areaAndRoom. Extra area and land area are optional and
            fall back to ''.
    """
    # All lookups are done before any cleaner runs: areaAndRoom and
    # cleanLocation remove nested <span> tags from the tree in place.
    dateText = obj.find('span', attrs={'class': 'hcl-label hcl-label--state hcl-label--sold-at'}).text
    addressText = obj.find('h2', attrs={'class': 'sold-property-listing__heading qa-selling-price-title hcl-card__title'}).text
    locationText = obj.find('span', attrs={'class': 'property-icon property-icon--result'}).parent
    areaText = obj.find('div', attrs={'class': 'sold-property-listing__subheading sold-property-listing__area'})

    extraAreaTag = obj.find('span', attrs={'class': 'listing-card__attribute--normal-weight'})
    extraAreaText = extraAreaTag.text if extraAreaTag else ''

    landAreaTag = obj.find('div', attrs={'class': 'sold-property-listing__land-area'})
    landAreaText = landAreaTag.text if landAreaTag else ''

    priceText = obj.find('span', attrs={'class': 'hcl-text hcl-text--medium'}).text

    area, room = areaAndRoom(areaText)
    return [
        date_obj(dateText),
        addressText.strip(),
        cleanLocation(locationText),
        area,
        zeroIfNoNumber(extraAreaText),
        room,
        cleanLandArea(landAreaText),
        cleanPrice(priceText),
    ]


<h1> Parse files</h1>

In [193]:
# Parse every saved result page and gather all sold listings in one DataFrame.
dir_path = '../kungalv_slutpriser/*.html'
files = glob.glob(dir_path)
columns = ['Date','Address','Location','Area','ExtraArea','Room', 'LandArea','Price']

# Rows are collected in a plain list and the frame is built once at the end;
# appending to a DataFrame one row at a time re-allocates it on every insert.
rows = []
for name in files:
    try:
        with open(name) as f:
            soup = BeautifulSoup(f, "html.parser")
            objects = soup.find_all('li', attrs={'class': 'sold-results__normal-hit'})
            for obj in objects:
                rows.append(parseObject(obj))
    except IOError as exc:
        # A directory whose name matches the pattern is skipped; any other
        # I/O failure is re-raised.
        if exc.errno != errno.EISDIR:
            raise

entities = pd.DataFrame(rows, columns=columns)

print(entities.head())

        Date              Address                        Location  Area  \
0 2017-11-23          Sjöhåla 580     Kovikshamn, Kungälvs kommun    94   
1 2017-11-18       Galeasgatan 15        Kungälv, Kungälvs kommun   103   
2 2017-11-17  Västerhöjdsvägen 36          Kärna, Kungälvs kommun   107   
3 2017-11-16     Gråstensvägen 19  Kode Halltorp, Kungälvs kommun    94   
4 2017-11-16           Hägnan 135         KAREBY, Kungälvs kommun   235   

   ExtraArea  Room  LandArea    Price  
0         87     5      1068  3100000  
1         64     5       610  3850000  
2          0     5       258  4000000  
3          0     5      1197  3200000  
4          0     6    104335  8800000  


In [196]:
# Persist the parsed listings as a UTF-8 encoded CSV file, without the row index column.
entities.to_csv('entities.csv', index=False, encoding='utf-8')