In [1]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as BS
from unidecode import unidecode

# Finn.no house and apartment scraper

# Validators

In [2]:
def vali_only_one_price(soup_item):
    '''
    Tells whether the ad page shows at most one price.

    Every <span> with class 'u-t3' whose transliterated text contains 'kr'
    counts as a price. Two or more prices mean the ad is an apartment
    complex that is not finished yet.
    Returns True for zero or one price, False otherwise.
    '''
    price_spans = [
        span
        for span in soup_item.find_all('span', class_='u-t3')
        if 'kr' in unidecode(span.get_text())
    ]
    return len(price_spans) <= 1

def vali_price_info(item):
    '''
    Tells whether an item is one of the price fields we collect.
    Returns True when it is, False otherwise.
    '''
    wanted_price_fields = (
        'Prisantydning',
        'Fellesgjeld',
        'Omkostninger',
        'Totalpris',
        'Felleskost/mnd.',
    )
    return item in wanted_price_fields

def vali_house_info(item):
    '''
    Tells whether an item is one of the house fields we collect.
    Returns True when it is, False otherwise.
    '''
    wanted_house_fields = (
        'Adress', 'Boligtype', 'Eieform bolig', 'Soverom',
        'Primærrom', 'Bruksareal', 'Grunnflate', 'Etasje',
        'Byggeår', 'Rom', 'Tomteareal', 'Bruttoareal', 'Areal',
    )
    return item in wanted_house_fields

def vali_has_numbers(string):
    '''
    Checks whether a string contains at least one digit character.
    Returns True as soon as a digit is found, or False if there is none.
    '''
    for char in string:
        if char.isdigit():
            return True
    return False

def vali_house_object(price_info_dict,house_info_dict):
    '''
    Checks that a scraped house has a usable asking price and gross area.

    price_info_dict: dict expected to hold an int under 'Prisantydning'.
    house_info_dict: dict expected to hold a non-zero 'Bruttoareal'.
    Prints one message per problem found.
    Returns True when no problem was found, False otherwise.
    '''
    errors = 0
    # .get() so that a missing price is reported as a validation error
    # instead of raising KeyError.
    if type(price_info_dict.get('Prisantydning')) != int:
        errors += 1
        print('Prisantydning is not int')
    # Only count an error when the area really is missing or zero; a valid
    # area must leave the error count untouched.
    if 'Bruttoareal' not in house_info_dict:
        errors += 1
        print('Bruttoareal does not exist')
    elif house_info_dict['Bruttoareal'] == 0:
        errors += 1
        print('Bruttoareal = 0')
    return errors == 0

# Support functions

In [8]:
def soup_object(url, timeout=30):
    '''
    Takes in an url and makes a soup object.

    url: address of the page to download.
    timeout: number of seconds passed to requests.get as its timeout
        (default 30), so a server that never answers cannot block the
        notebook indefinitely.
    Returns the downloaded content parsed with the 'html.parser' parser.
    '''
    page = requests.get(url, timeout=timeout)
    return BS(page.content, 'html.parser')

def find_adress(soup):
    '''
    Reads the adress of the apartment from the ad page.

    The adress is the text of the first <p> element with class 'u-caption'.
    '''
    captions = soup.find_all('p', class_='u-caption')
    first_caption = captions[0]
    return first_caption.get_text()

def find_price(soup):
    '''
    Reads the asking price of the apartment from the ad page.

    Looks at every <span> with class 'u-t3' and keeps the text of the last
    one containing 'kr'. When that text spans several lines, only its
    second line (stripped) is used.
    Returns the price as an int, or the string
    'Error in find price function' when no price text was found.
    '''
    price = ''
    for span in soup.find_all('span', class_='u-t3'):
        span_text = span.get_text()
        if 'kr' in span_text:
            price = unidecode(span_text)
    # Multi-line text: the price sits on the second line.
    if '\n' in price:
        second_line = price.splitlines()[1]
        price = second_line.strip()
    if price == '':
        return 'Error in find price function'
    return convert_to_int(price)
    
def convert_to_int(string):
    '''
    Builds an integer from the decimal digits found in a string.

    Every other character (spaces, 'kr', units, ...) is ignored, so
    '3 500 000 kr' becomes 3500000.
    Returns 0 when the string is empty or holds no decimal digit.
    '''
    # str.isdecimal rather than str.isdigit: isdigit also accepts characters
    # such as the superscript '²', which int() cannot parse.
    digits = ''.join(char for char in string if char.isdecimal())
    if digits == '':
        # Nothing numeric to convert; int('') would raise ValueError.
        return 0
    return int(digits)

def _clean_house_value(raw_text):
    '''
    Cleans the text of one <dd> element.

    Returns an int when the text (without its square-metre unit) contains
    digits, otherwise the cleaned text itself.
    '''
    text = unidecode(raw_text)
    if '\n' in text:
        # Multi-line text: the value sits on the second line. Fall back to
        # the first line when there is no second one.
        lines = text.splitlines()
        text = (lines[1] if len(lines) > 1 else lines[0]).strip()
    # Remove the unit as a substring. str.strip('m2') strips a set of
    # characters and would also eat leading '2' digits ('25 m2' -> '5').
    without_unit = text.replace('m2', '').replace('m²', '')
    if vali_has_numbers(without_unit):
        return convert_to_int(without_unit)
    return text

def find_house_information(soup):
    '''
    Collects the price and house information of an ad page.

    Pairs the text of every <dt> element (description) with the cleaned
    text of the <dd> element at the same position (value), adds the asking
    price under 'Prisantydning' and the adress under 'Adress', and sorts
    the entries with vali_price_info and vali_house_info. Entries accepted
    by neither validator are dropped.
    Returns a tuple (price_info_dict, house_info_dict).
    '''
    descriptions = [tag.get_text() for tag in soup.find_all('dt')]
    values = [_clean_house_value(tag.get_text()) for tag in soup.find_all('dd')]
    # Price and adress go in first; a <dt> with the same name overrides them.
    information = {
        'Prisantydning': find_price(soup),
        'Adress': find_adress(soup),
    }
    information.update(zip(descriptions, values))
    # Sort information after validators
    price_info_dict = {}
    house_info_dict = {}
    for key, value in information.items():
        if vali_price_info(key):
            price_info_dict[key] = value
        elif vali_house_info(key):
            house_info_dict[key] = value
    return price_info_dict, house_info_dict

def _ad_links_on_page(url):
    '''
    Returns the ad links found on one search result page.
    '''
    soup = soup_object(url)
    # The first matching anchor is skipped, as the original code did.
    # NOTE(review): presumably it is not a regular ad - confirm.
    anchors = soup.find_all('a', class_='ads__unit__link')[1:]
    # urljoin leaves absolute hrefs untouched and only prefixes relative ones,
    # so absolute hrefs no longer become 'http://finn.nohttps://...'.
    return [urljoin('http://finn.no', anchor['href']) for anchor in anchors]

def find_links_to_ads(link, max_pages=10):
    '''
    Collects the links to the ads listed by a search url.

    link: search url. When it contains 'page=', the page number is replaced
        by 0, 1, 2, ... and each page is read until one has no ads or
        max_pages pages were read. Otherwise only that single page is read.
    max_pages: highest number of pages to read for a paged url (default 10).
    Returns a list with the url of every ad found.
    '''
    cut = 'page='
    if cut not in link:
        return _ad_links_on_page(link)

    # Split around the whole page number, which may have several digits or
    # none at all.
    number_start = link.index(cut) + len(cut)
    number_end = number_start
    while number_end < len(link) and link[number_end].isdigit():
        number_end += 1
    string_1 = link[:number_start]
    string_2 = link[number_end:]

    annonser = []
    for x in range(max_pages):
        page_links = _ad_links_on_page(string_1 + str(x) + string_2)
        if not page_links:
            print('Did not find more ads, search ended at page: {}'.format(x))
            break
        annonser.extend(page_links)
    return annonser

def find_zip(adress):
    '''
    Extracts the zip code from an adress string formatted as
    'adress, zip city'.

    The zip code is taken to be the last whitespace-separated token made
    only of digits, since it sits right before the city. Earlier numeric
    tokens such as house numbers are ignored.
    Returns the zip code as an int, so a leading zero is dropped
    ('0150' becomes 150).
    Raises IndexError when the adress holds no numeric token.
    '''
    numbers = [int(token) for token in adress.split() if token.isdecimal()]
    if not numbers:
        raise IndexError('No zip code found in adress: {!r}'.format(adress))
    return numbers[-1]

def find_picture_url(soup):
    '''
    Returns the url of the first picture of the ad.

    Takes the first <img> with class 'img-format__img u-border-radius-8px'
    and returns the url of the first entry of its 'srcset' attribute.
    '''
    images = soup.find_all('img', class_="img-format__img u-border-radius-8px")
    first_candidate = images[0]['srcset'].split(',')[0]
    return first_candidate.split(' ')[0]

# Objects

In [4]:
class HouseClass:
    '''
    One scraped ad.

    adress: the full adress text returned by find_adress.
    district: the part of the adress after its last comma, or '' when the
        adress has no comma.
    price: dict with the price information of the ad.
    house: dict with the house information of the ad.
    '''
    def __init__(self,soup):
        # find_adress returns a single string, so it cannot be unpacked into
        # two attributes.
        full_adress = find_adress(soup)
        self.adress = full_adress
        # NOTE(review): assumes the adress looks like 'adress, zip city', so
        # the text after the last comma names the area - confirm this is the
        # intended meaning of district.
        if ',' in full_adress:
            self.district = full_adress.rpartition(',')[2].strip()
        else:
            self.district = ''
        self.price,self.house = find_house_information(soup)

    def __str__(self):
        return self.adress

# Script for a list of apartments

In [5]:
# Search url to scrape. It has no 'page=' parameter, so find_links_to_ads
# reads only this single result page.
link = 'https://www.finn.no/realestate/homes/search.html?filters=&location=1.22038.20131&published=1'

annonser= find_links_to_ads(link)

# Maps each ad url to the HouseClass built from that ad's page.
results={}

for apartment in annonser:
    soup = soup_object(apartment)
    # Only ads accepted by vali_only_one_price are parsed; the others are
    # reported and skipped.
    if vali_only_one_price(soup):
        results[apartment]=HouseClass(soup)
    else:
        print('Apartment: {} is an complex with many apartments'.format(apartment))
        
print('<------------------- Results---------------------->')

# x is the ad url and y the HouseClass; print(y) shows the adress through
# HouseClass.__str__.
for x,y in results.items():
    print(x)
    print('District {}'.format(y.district))
    print(y)
    print(y.price)
    print(y.house)
    print('<------------------- New Item---------------------->')

ValueError: substring not found

In [11]:
# Collect the ad links again, this time passing the search url inline.
annonser = find_links_to_ads('https://www.finn.no/realestate/homes/search.html?filters=&location=1.22038.20131&published=1')

In [12]:
# Show the collected ad links.
annonser

[]

# Testing

In [13]:
#soup = soup_object(link)
#all_objects = soup.find_all('a',class_='ads__unit__link')
#all_objects = all_objects[1:]
 #for y in all_objects:
    #ad_link = 'http://finn.no'+y['href']
    #annonser.append(ad_link)

In [24]:
# Download and parse a search result page; this url carries two location filters.
soup = soup_object('https://www.finn.no/realestate/homes/search.html?filters=&location=1.22038.20131&location=1.22038.20134&published=1')

In [25]:
# Every anchor with the 'ads__unit__link' class on that page.
all_objects = soup.find_all('a',class_='ads__unit__link')

In [27]:
# Print every link except the first, with the same 'http://finn.no' prefix
# that find_links_to_ads adds. The output below shows the hrefs are already
# absolute, so the prefix produces malformed urls.
for ad in all_objects[1:]:
    print('http://finn.no'+ad['href'])

http://finn.nohttps://www.finn.no/realestate/homes/ad.html?finnkode=200633580
http://finn.nohttps://www.finn.no/eiendom/nybygg/prosjekt?finnkode=172690872&location=1.22038.20131&location=1.22038.20134&published=1
http://finn.nohttps://www.finn.no/realestate/homes/ad.html?finnkode=200667547


In [29]:
# Same extraction as find_links_to_ads, but the hrefs are stored as-is,
# without the 'http://finn.no' prefix.
link='https://www.finn.no/realestate/homes/search.html?filters=&location=1.22038.20134&published=1'
soup = soup_object(link)
annonser=[]
all_objects = soup.find_all('a',class_='ads__unit__link')
# The first matching anchor is skipped, as in find_links_to_ads.
all_objects = all_objects[1:]
for y in all_objects:
    ad_link = y['href']
    annonser.append(ad_link)

In [30]:
# Show the links collected without the prefix.
annonser

['https://www.finn.no/realestate/homes/ad.html?finnkode=200633580',
 'https://www.finn.no/eiendom/nybygg/prosjekt?finnkode=172690872&location=1.22038.20134&published=1',
 'https://www.finn.no/realestate/homes/ad.html?finnkode=200667547']