In [3]:
import numpy as np
import pandas as pd 
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tqdm as tqdm
from lxml import html
import time

In [109]:
class HouseForSale: 
    '''
    Getting desired infos from the webpage provided
    '''
    def __init__(self, link):
        '''
        Input a link in order to extract the necessary information
        '''
        self.link = link

    def getSoup(self):
        r = requests.get(self.link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'html')
    
    def getParameters(self):
        '''
        Get parameters such as price, disposition, the state of the house
        '''
        bf = self.getSoup()
        bf_table = bf.find('div',{'data-element':"detail-description"}).find('table')
        table_extract = [i.text.strip() for i in bf_table.findAll('tr')]
        df_table = pd.DataFrame(table_extract)
        df_table_split = df_table[0].str.split("\n", expand = True)
        df = pd.DataFrame(df_table_split[[0,1]])
        df.columns = ['metric', 'value']
        df = df.set_index('metric')
        return df
    
    def getMap(self):
        '''
        Get location of the property using embeded google maps longtitute and lattitude parameters
        '''
        bf = self.getSoup()
        bf_map = bf.find('div',{'id':"map"})
        x = bf_map.find('iframe')['src'].find('q=') + 2 
        y = bf_map.find('iframe')['src'].find('&key')
        location = bf_map.find('iframe')['src'][x:y]
        loc_str = location.split(',')
        loc_dict = {"lat" : [loc_str[0]], "long": [loc_str[1]]}
        loc_df = pd.DataFrame.from_dict(loc_dict, orient = 'index', columns = ['value'])
        return loc_df
    
    def getDf(self):
        '''
        Return a wide dataframe from getMap() and getParameters(), index = latitute and longtitude
        '''
        df_par = self.getParameters()
        df_map = self.getMap()
        df = pd.concat([df_par, df_map])
        df = df.T.set_index('lat', 'long')
        return df

In [107]:
one_house = HouseForSale('https://www.bezrealitky.cz/nemovitosti-byty-domy/595733-nabidka-prodej-bytu')

# Downloader class
- Manual test
- defining class

## Manual test

In [12]:
def getLinks_test(link):
    
    page = requests.get(link)
    if page.status_code == requests.codes.ok:
        wp = getSoup_test(link)
        base_url = 'https://www.bezrealitky.cz'
        links = wp.findAll('div', {'product__body'})
        return [base_url + equity.find('a')['href'] for equity in links]

    next_page_text = wp.find('ul', class_="pagination justify-content-md-end").findAll('li')[-1].text

    if next_page_text == 'Další >':
        next_page_partial = wp.find('ul', class_="pagination justify-content-md-end").findAll('li')[-1].find('a')['href']
        next_page_url = base_url + next_page_partial
        links = wp.findAll('div', {'product__body'})
        return [base_url + equity.find('a')['href'] for equity in links]
    # No more 'Next' pages, finish the script
    else:
        print('Done')

In [13]:
l = "https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?_token=qNLlM7SSvDXIFrdOKHnbj0gv_-OMf7vw6yUc_-ILcFo"
links = getLinks_test(l)

In [14]:
HouseForSale(links[0]).getDf()

Unnamed: 0,value
Internet:,
Číslo inzerátu:,602006
Dispozice:,4+1
Plocha:,95 m²
Cena:,3.290.000 Kč
Město:,Praha
Městská část:,Řepy
Typ vlastnictví:,Družstevní
Typ budovy:,Panel
PENB:,D


## Defining class
Manual test working - Downloader_single() working
Pagination - Downloader_multi() work in progress

In [16]:
# Downloader(l)

In [17]:
class Downloader_single:
    '''
    Download all links real estate to initiate
    '''
    def __init__(self, link):
        '''
        Give link to extract links of given webpages
        '''
        self.link = link
    
    def getSoup(self):
        r = requests.get(self.link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'lxml')

    def getLinks(self):
        page = requests.get(self.link)
        if page.status_code == requests.codes.ok:
            wp = self.getSoup()
            base_url = 'https://www.bezrealitky.cz'
            links = wp.findAll('div', {'product__body'})
            return [base_url + equity.find('a')['href'] for equity in links]

In [18]:
Downloader_single('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?_token=qNLlM7SSvDXIFrdOKHnbj0gv_-OMf7vw6yUc_-ILcFo&page=3').getLinks()

['https://www.bezrealitky.cz/nemovitosti-byty-domy/604370-nabidka-prodej-bytu-spanielova-praha',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/596362-nabidka-prodej-bytu-merhoutova-prague',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604185-nabidka-prodej-bytu-ucnovska',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604354-nabidka-prodej-bytu-sanderova',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/602506-nabidka-prodej-bytu-nekvasilova',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604357-nabidka-prodej-bytu-mantovska',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604332-nabidka-prodej-bytu-spanielova-praha',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/597584-nabidka-prodej-bytu-v-remizku-praha',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/603606-nabidka-prodej-bytu-tulesicka',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604353-nabidka-prodej-bytu-za-zameckem-praha']

In [19]:
class Downloader_multi:
    '''
    Download all links real estate to initiate
    '''
    def __init__(self, link):
        '''
        Give link to extract links of given webpages
        '''
        self.link = link
    
    def getSoup(self):
        r = requests.get(self.link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'lxml')

    def getLinks(self):
        page = requests.get(self.link)
        if page.status_code == requests.codes.ok:
            wp = self.getSoup()
            base_url = 'https://www.bezrealitky.cz'
            links = wp.findAll('div', {'product__body'})
            return [base_url + equity.find('a')['href'] for equity in links]
        
#         if next_page_text == 'Další >':
#             next_page_partial = wp.find('ul', class_="pagination justify-content-md-end").findAll('li')[-1].find('a')['href']
#             next_page_url = base_url + next_page_partial
#             getLinks(next_page_url)
#         # No more 'Next' pages, finish the script
#         else:
#             print('Done')
        
            
        

In [20]:
Downloader_single('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?_token=qNLlM7SSvDXIFrdOKHnbj0gv_-OMf7vw6yUc_-ILcFo&page=63').getLinks()

[]

In [34]:
class Downloader_multi_byJan:
    '''
    Download all links real estate to initiate
    '''
    def __init__(self, link):
        '''
        Give link to extract links of given webpages
        '''
        self.link = link
    
    def getSoup(self):
            r = requests.get(link)
            r.encoding = 'UTF-8'
            return BeautifulSoup(r.text, 'lxml')

    def getSoupNext(link): 
            r = requests.get(link)
            r.encoding = 'UTF-8'
            return BeautifulSoup(r.text, 'lxml')

    def getAllPages(self):    
            wp = self.getSoup(link)
            base_url = 'https://www.bezrealitky.cz'
            test_element = wp.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')
            pglinks = pd.Series(['https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha'])
            while test_element != None:
                    partial_url = wp.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')['href']
                    next_url = base_url + partial_url
                    pglinks = pglinks.append(pd.Series([next_url]), ignore_index = True)
                    wp_next = self.getSoupNext(next_url)
                    test_element = wp_next.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')
                    wp = wp_next
            else: print("Loop is done")
            # return pglinks
        
    def getLinks(self):
            page = requests.get(link)
            if page.status_code == requests.codes.ok:
                for pg in pglinks:

                    wp = self.getSoup()
                    base_url = 'https://www.bezrealitky.cz'
                    eqlinks = wp.findAll('div', {'product__body'})
                    return [base_url_prague + equity.find('a')['href'] for equity in eqlinks]


In [27]:
Webpage = Downloader_multi_byJan('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha')

In [35]:
Webpage.getSoupNext('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha')

TypeError: getSoupNext() takes 1 positional argument but 2 were given

In [15]:
class Downloader_multi_byTiep:
    '''
    Download all links real estate to initiate
    '''
    def __init__(self, link):
        '''
        Give link to extract links of given webpages
        '''
        self.link = link
    
    def getSoup(self):
            r = requests.get(link)
            r.encoding = 'UTF-8'
            return BeautifulSoup(r.text, 'lxml')

    def getSoupNext(link): 
            r = requests.get(link)
            r.encoding = 'UTF-8'
            return BeautifulSoup(r.text, 'lxml')

    def getLinks(self):
            page = requests.get(link)
            if page.status_code == requests.codes.ok:
                wp = self.getSoup()
                base_url = 'https://www.bezrealitky.cz'
                links = wp.findAll('div', {'product__body'})
                return [base_url_prague + equity.find('a')['href'] for equity in links]

    def getAllPages(self):    
            wp = getSoup_test(link)
            base_url = 'https://www.bezrealitky.cz'
            test_element = wp.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')
            links = pd.Series(['https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha'])
            while test_element != None:
                    partial_url = wp.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')['href']
                    next_url = base_url + partial_url
                    links = links.append(pd.Series([next_url]), ignore_index = True)
                    wp_next = getSoupNext_test(next_url)
                    test_element = wp_next.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')
                    wp = wp_next
            else: print("Loop is done")
            return links

In [17]:
Downloader_multi_byTiep('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha').getAllPages()

NameError: name 'getSoup_test' is not defined

In [24]:
def getSoup_test(link):
        r = requests.get(link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'lxml')
    
def getSoupNext_test(link): 
        r = requests.get(link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'lxml')

def getLinks_test(link):
        page = requests.get(link)
        if page.status_code == requests.codes.ok:
            wp = self.getSoup()
            base_url = 'https://www.bezrealitky.cz'
            links = wp.findAll('div', {'product__body'})
            return [base_url_prague + equity.find('a')['href'] for equity in links]
    
def getAllPages_test(link):    
        wp = getSoup_test(link)
        base_url = 'https://www.bezrealitky.cz'
        test_element = wp.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')
        links = pd.Series(['https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha'])
        while test_element != None:
                partial_url = wp.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')['href']
                next_url = base_url + partial_url
                links = links.append(pd.Series([next_url]), ignore_index = True)
                wp_next = getSoupNext_test(next_url)
                test_element = wp_next.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')
                wp = wp_next
        else: print("Loop is done")
        return links

In [25]:
links = getAllPages_test('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha')
links

Loop is done


0     https://www.bezrealitky.cz/vypis/nabidka-prode...
1     https://www.bezrealitky.cz/vypis/nabidka-prode...
2     https://www.bezrealitky.cz/vypis/nabidka-prode...
3     https://www.bezrealitky.cz/vypis/nabidka-prode...
4     https://www.bezrealitky.cz/vypis/nabidka-prode...
5     https://www.bezrealitky.cz/vypis/nabidka-prode...
6     https://www.bezrealitky.cz/vypis/nabidka-prode...
7     https://www.bezrealitky.cz/vypis/nabidka-prode...
8     https://www.bezrealitky.cz/vypis/nabidka-prode...
9     https://www.bezrealitky.cz/vypis/nabidka-prode...
10    https://www.bezrealitky.cz/vypis/nabidka-prode...
11    https://www.bezrealitky.cz/vypis/nabidka-prode...
12    https://www.bezrealitky.cz/vypis/nabidka-prode...
13    https://www.bezrealitky.cz/vypis/nabidka-prode...
14    https://www.bezrealitky.cz/vypis/nabidka-prode...
15    https://www.bezrealitky.cz/vypis/nabidka-prode...
16    https://www.bezrealitky.cz/vypis/nabidka-prode...
17    https://www.bezrealitky.cz/vypis/nabidka-p

In [26]:
links[1]

'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=2'

In [27]:
print(links[0])

https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha


In [28]:
links = pd.Series(['https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha'])
links
links.append(pd.Series(['https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=2']))
links

0    https://www.bezrealitky.cz/vypis/nabidka-prode...
dtype: object

In [29]:
soup = getSoup_test('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=60')

In [30]:
soup.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next') != None

True

## Tutorial for pagination
link - https://letslearnabout.net/python/beautiful-soup/how-to-get-the-next-page-on-beautiful-soup/#next-page

In [None]:
def parse_page(next_url):
    # HTTP GET requests
  page = requests.get(next_url)

  # Checking if we successfully fetched the URL
  if page.status_code == requests.codes.ok:
    bs = BeautifulSoup(page.text, 'lxml')

    check_no_results = bs.find('ul', class_="SearchResults").find('p')
    if check_no_results and check_no_results.text:
        print('Search returned no results.')
        return None

    # Fetching all items
    list_all_cd = bs.findAll('li', class_='ResultItem')

    
    for cd in list_all_cd:
      get_cd_attributes(cd)

    next_page_text = bs.find('ul', class_="SearchBreadcrumbs").findAll('li')[-1].text
    
    if next_page_text == 'Next':
        next_page_partial = bs.find('ul', class_="SearchBreadcrumbs").findAll(
            'li')[-1].find('a')['href']
        next_page_url = base_url + next_page_partial
        print(next_page_url)
        parse_page(next_page_url)
    # No more 'Next' pages, finish the script
    else:
        export_table_and_print(data)

parse_page(search_url)