# PythonDataIES project - bezrealitky.cz
Authors: Tiep Luu Danh, Jan Malecha

In [4]:
import numpy as np
import pandas as pd 
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tqdm as tqdm
from lxml import html
import time
import itertools

## Bezrealitky pages class structure

**Estate (parent)** - class containing general methods for parsing bezrealitky websites
 - **Flat (child)** - standard properties; its methods carry no special suffix because this is the type of the majority of observations
 - **NBFlat (child)** - new-build properties, a group of new-build estates whose pages look very different; methods of this class end with `NB`

In [8]:
class Estate:
    '''
    Parent class for all versions of bezrealitky property pages.

    Holds the page URL and the parsed page, and provides the general
    download-and-parse step used when scraping these websites.
    '''
    def __init__(self, link):
        '''
        Store the page URL and download + parse the page.

        link: URL of the property page
        '''
        self.link = link
        # getSoup has to be *called* here; assigning the method object itself
        # would leave self.soup as a bound method instead of a parsed page.
        self.soup = self.getSoup()

    def getSoup(self):
        '''
        Download self.link and return it as a BeautifulSoup object.

        The response is decoded as UTF-8 before parsing.
        '''
        r = requests.get(self.link)
        r.encoding = 'UTF-8'

        return BeautifulSoup(r.text, 'html')
    
    

In [13]:
class Flat(Estate):
    '''
    Standard bezrealitky property page.

    Attributes created by the constructor:
      link        - URL of the page
      soup        - parsed page
      parameters  - dataframe returned by getParameters()
      coordinates - dataframe returned by getMap()
    '''
    def __init__(self, link):
        '''
        Constructor for Flat calls the parent Estate constructor first,
        then makes sure self.soup holds the parsed page.

        Flat parameters and coordinates are then generated as Flat attributes.
        '''
        # Calling Estate constructor
        super().__init__(link)

        # Download the page only when the parent constructor did not already
        # leave a parsed document in self.soup (avoids a second request)
        if not isinstance(self.soup, BeautifulSoup):
            self.soup = self.getSoup()

        # Getting parameters of flat
        self.parameters = self.getParameters()

        # Getting coordinates of flat
        self.coordinates = self.getMap()

    def getParameters(self):
        '''
        Get parameters of given estate - such as price, disposition, the state of the house, etc.

        Returns a dataframe indexed by 'metric' with a single column 'value';
        a row without a value part gets None as its value.
        '''
        bf_table = self.soup.find('div', {'data-element': "detail-description"}).find('table')

        records = []
        for row in bf_table.find_all('tr'):
            # Row text is "<metric>\n<value>"; only the first two parts are used
            cells = row.text.strip().split("\n")
            records.append((cells[0], cells[1] if len(cells) > 1 else None))

        # Building from records keeps both columns even when no row has a value
        # (or the table is empty), where selecting split columns would fail
        df = pd.DataFrame(records, columns=['metric', 'value'])

        return df.set_index('metric')

    def getMap(self):
        '''
        Get location of the property using the "q=<lat>,<long>" parameter of
        the embedded map iframe URL.

        Returns a dataframe with index ['lat', 'long'] and a single column
        'value' (coordinates are kept as strings).

        Raises ValueError when the iframe URL carries no usable coordinates.
        '''
        bf_map = self.soup.find('div', {'id': "map"})
        src = bf_map.find('iframe')['src']

        start = src.find('q=')
        if start == -1:
            raise ValueError(f'No "q=" coordinate parameter in map URL: {src}')
        start += 2

        # Coordinates run up to the next URL parameter, or to the end of the
        # URL when "q" is the last one (find() returns -1 in that case, which
        # must not be used as a slice end)
        end = src.find('&', start)
        location = src[start:] if end == -1 else src[start:end]

        loc_str = location.split(',')
        if len(loc_str) < 2:
            raise ValueError(f'Cannot read "lat,long" from map URL: {src}')

        loc_dict = {"lat": [loc_str[0]], "long": [loc_str[1]]}
        loc_df = pd.DataFrame.from_dict(loc_dict, orient='index', columns=['value'])

        return loc_df

    def getDf(self):
        '''
        Return a long dataframe built from getParameters() and getMap():
        one row per parameter, with columns lat, long, variable and value.
        '''
        df_par = self.getParameters()
        df_map = self.getMap()
        df = pd.concat([df_par, df_map])
        # Transpose to a single row so lat/long can serve as id columns
        df = pd.melt(df.T, id_vars=['lat', 'long'])
        return df

**Example:**

In [16]:
# Scrape a single listing and display its parameters next to its coordinates
Flat(
    link='https://www.bezrealitky.cz/nemovitosti-byty-domy/603752-nabidka-prodej-bytu-mezi-skolami-praha'
).getDf()

Unnamed: 0,lat,long,variable,value
0,50.052594,14.342979,Internet:,
1,50.052594,14.342979,Číslo inzerátu:,603752
2,50.052594,14.342979,Dispozice:,3+kk
3,50.052594,14.342979,Plocha:,82 m²
4,50.052594,14.342979,Cena:,5.690.000 Kč
5,50.052594,14.342979,Město:,Praha
6,50.052594,14.342979,Městská část:,Stodůlky
7,50.052594,14.342979,Typ vlastnictví:,Osobní
8,50.052594,14.342979,Typ budovy:,Panel
9,50.052594,14.342979,PENB:,C


In [15]:
class NBFlat(Estate):
    '''
    New-build bezrealitky property page.

    Attributes created by the constructor:
      link       - URL of the page
      soup       - parsed page
      parameters - dataframe returned by getParametersNB()
    '''
    def __init__(self, link):
        '''
        Constructor for new-build Flat calls the parent Estate constructor first,
        then makes sure self.soup holds the parsed page.

        New-build flat parameters are then generated as an NBFlat attribute.
        '''
        # Calling Estate constructor
        super().__init__(link)

        # Download the page only when the parent constructor did not already
        # leave a parsed document in self.soup (avoids a second request)
        if not isinstance(self.soup, BeautifulSoup):
            self.soup = self.getSoup()

        # Getting parameters of flat
        self.parameters = self.getParametersNB()

        # TODO: set self.coordinates = self.getMapNB() once getMapNB is implemented

    def getParametersNB(self):
        '''
        For new-build properties get parameters such as price, disposition, the state of the house.

        Each table row is split on newlines; parts 0-1 and parts 2-3 are each
        taken as one metric/value pair and stacked into a single dataframe
        with columns 'metric' and 'value'.
        NOTE(review): assumes every row splits into at least four parts - confirm against a live page.
        '''
        bf_table = self.soup.find('div', {'id': 'tabInformace'}).find('table')
        table_extract = [tr.text.strip() for tr in bf_table.find_all('tr', {'class': 'cline'})]
        df_table_split = pd.DataFrame(table_extract)[0].str.split("\n", expand=True)

        # Going through a plain array drops the original column labels so both
        # halves can be relabelled as metric/value
        first_pair = pd.DataFrame(np.array(df_table_split[[0, 1]]), columns=['metric', 'value'])
        second_pair = pd.DataFrame(np.array(df_table_split[[2, 3]]), columns=['metric', 'value'])

        # ignore_index gives the stacked frame a unique 0..n-1 index
        return pd.concat([first_pair, second_pair], ignore_index=True)

    def getDf(self):
        '''
        Return the dataframe from getParametersNB().

        TODO: once getMapNB exists, concatenate its coordinates and melt on
        lat/long the same way Flat.getDf does.
        '''
        return self.getParametersNB()
    
        
# FOLLOWING METHOD is currently work in progress
#     def getMapNB(self) 
#         '''
#         Get location of the property using embeded google maps longtitute and lattitude parameters
#         '''
#         bf = self.getSoup()
#         bf_map = bf.find('div',{'id':"map"})
#         x = bf_map.find('iframe')['src'].find('q=') + 2 
#         y = bf_map.find('iframe')['src'].find('&key')
#         location = bf_map.find('iframe')['src'][x:y]
#         loc_str = location.split(',')
#         loc_dict = {"lat" : [loc_str[0]], "long": [loc_str[1]]}
#         loc_df = pd.DataFrame.from_dict(loc_dict, orient = 'index', columns = ['value'])
        
#         return loc_df
    

# Downloader class

In [21]:
class LinkManager:
    '''
    Helper around one search-results page: lists the paginated result pages
    of the search and the property links found on the page.
    '''
    def __init__(self, link):
        '''
        Store the search URL and download + parse it.

        link: URL of the search-results page
        '''
        self.link = link
        self.start_num = 1
        self.soup = self.getSoup()

    def getSoup(self):
        '''
        Download self.link and return it as a BeautifulSoup object.
        '''
        # Use the instance's own URL rather than a global variable
        r = requests.get(self.link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'lxml')

    def generate_offerpages(self):
        '''
        Generates list of all pages for specific search.

        The first entry is self.link itself; further pages get a "page=<n>"
        query parameter. Only self.link is returned when the page has no
        pagination element.
        '''
        all_pages = [self.link]

        pagination = self.soup.find('ul', class_="pagination justify-content-md-end")
        if pagination is None:
            return all_pages

        items = pagination.find_all('li')
        if len(items) < 2:
            return all_pages
        # The last-but-one pagination item carries the number of the last page
        last_page = int(items[-2].text)

        # Start the query string with '?' when the URL has none yet
        separator = '&' if '?' in self.link else '?'
        all_pages.extend(
            f'{self.link}{separator}page={number}'
            for number in range(self.start_num + 1, last_page + 1)
        )

        return all_pages

    def getLinks(self):
        '''
        Get absolute links of all properties listed on the page.

        Works on the page already downloaded by the constructor (self.soup).
        Returns an empty list when the page lists no properties.
        '''
        base_url = 'https://www.bezrealitky.cz'
        links_list = []
        for equity in self.soup.find_all('div', class_='product__body'):
            anchor = equity.find('a')
            if anchor is None:
                continue
            href = anchor['href']
            # Relative links are prefixed with the site root
            links_list.append(href if base_url in href else base_url + href)
        return links_list

In [50]:
class Downloader:
    '''
    Download all links of real estate properties on the given website.

    Attributes created by the constructor:
      link  - URL of the search-results page
      soup  - parsed first results page
      pages - list of all result-page URLs (see getPages)
      links - list of property links from all pages (see getLinks)
    '''
    def __init__(self, link):
        '''
        Provide real estate webpage to extract the properties links
        '''
        self.link = link
        self.start_num = 1
        self.soup = self.getSoup()
        self.pages = self.getPages()
        self.links = self.getLinks()

    def getSoup(self):
        '''
        Download self.link and return it as a BeautifulSoup object.
        '''
        # Use the instance's own URL rather than a global variable
        r = requests.get(self.link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'lxml')

    def getPages(self):
        '''
        Generates list of all pages for specific search.

        The first entry is self.link itself; further pages get a "page=<n>"
        query parameter. Only self.link is returned when the page has no
        pagination element.
        '''
        all_pages = [self.link]  # create a list of pages

        pagination = self.soup.find('ul', class_="pagination justify-content-md-end")
        if pagination is None:
            return all_pages

        items = pagination.find_all('li')
        if len(items) < 2:
            return all_pages
        # The last-but-one pagination item carries the number of the last page
        last_page = int(items[-2].text)

        # Start the query string with '?' when the URL has none yet
        separator = '&' if '?' in self.link else '?'
        all_pages.extend(
            f'{self.link}{separator}page={number}'
            for number in range(self.start_num + 1, last_page + 1)
        )

        return all_pages

    def getLinks(self):
        '''
        Get absolute links of flats from every page in self.pages.

        The first page (self.link) is taken from self.soup, which is already
        downloaded; every other page is requested once and skipped when the
        server does not answer with an OK status.
        '''
        base_url = 'https://www.bezrealitky.cz'
        # One list for all pages, returned only after the loop has finished
        links_list = []

        for p in self.pages:
            if p == self.link:
                page_soup = self.soup
            else:
                response = requests.get(p)
                if response.status_code != requests.codes.ok:
                    continue
                response.encoding = 'UTF-8'
                # Parse the page that was just requested, not the first page
                page_soup = BeautifulSoup(response.text, 'lxml')

            for equity in page_soup.find_all('div', class_='product__body'):
                anchor = equity.find('a')
                if anchor is None:
                    continue
                href = anchor['href']
                # Relative links are prefixed with the site root
                links_list.append(href if base_url in href else base_url + href)

        return links_list
        

In [51]:
# Build a downloader for the praha-karlin search and print the shape of its page list
dl = Downloader(
    link='https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha/praha-karlin?_token=Yve3Tz0ogvRcbrMRpzUq5Kjn-3SXp81q06ehOhiZvrw'
)
result_pages = dl.getPages()
print(np.shape(result_pages))

(292,)


In [5]:
# Collect the result pages of the praha-zizkov search.
# NOTE(review): Downloader_multi_byTiep is not defined in the visible cells of
# this notebook - confirm where it comes from before a Restart & Run All.
links_class = (
    Downloader_multi_byTiep('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha/praha-zizkov')
    .getAllPages()
)

Loop is done


In [6]:
# Apply getLinks to every collected page, then flatten the per-page results into one list.
# NOTE(review): a module-level getLinks function is not defined in the visible
# cells of this notebook - confirm where it comes from.
links_properties = list(map(getLinks, links_class))
links_properties_unlist = list(itertools.chain.from_iterable(links_properties))

In [7]:
# Flatten the per-page link lists into a single list of property links
links_properties_unlist = [url for page_links in links_properties for url in page_links]