In [234]:
import numpy as np
import pandas as pd 
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tqdm as tqdm
from lxml import html
import time
import itertools

In [188]:
class HouseForSale: 
    '''
    Getting desired infos from the webpage provided
    '''
    def __init__(self, link):
        '''
        Input a link in order to extract the necessary information
        '''
        self.link = link

    def getSoup(self):
        r = requests.get(self.link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'html')
    
    def getParameters(self):
        '''
        Get parameters such as price, disposition, the state of the house
        '''
        bf = self.getSoup()
        bf_table = bf.find('div',{'data-element':"detail-description"}).find('table')
        table_extract = [i.text.strip() for i in bf_table.findAll('tr')]
        df_table = pd.DataFrame(table_extract)
        df_table_split = df_table[0].str.split("\n", expand = True)
        df = pd.DataFrame(df_table_split[[0,1]])
        df.columns = ['metric', 'value']
        df = df.set_index('metric')
        return df
    
    def getMap(self):
        '''
        Get location of the property using embeded google maps longtitute and lattitude parameters
        '''
        bf = self.getSoup()
        bf_map = bf.find('div',{'id':"map"})
        x = bf_map.find('iframe')['src'].find('q=') + 2 
        y = bf_map.find('iframe')['src'].find('&key')
        location = bf_map.find('iframe')['src'][x:y]
        loc_str = location.split(',')
        loc_dict = {"lat" : [loc_str[0]], "long": [loc_str[1]]}
        loc_df = pd.DataFrame.from_dict(loc_dict, orient = 'index', columns = ['value'])
        return loc_df
    
    def getDf(self):
        '''
        Return a wide dataframe from getMap() and getParameters(), index = latitute and longtitude
        '''
        df_par = self.getParameters()
        df_map = self.getMap()
        df = pd.concat([df_par, df_map])
        df = pd.melt(df.T, id_vars = ['lat','long'])
        return df

In [203]:
house_1 = HouseForSale('https://www.bezrealitky.cz/nemovitosti-byty-domy/522228-nabidka-prodej-bytu-plzenska-hlavni-mesto-praha').getDf()
house_1 = house_1.set_index(['lat','long'])

In [204]:
house_2 = HouseForSale('https://www.bezrealitky.cz/nemovitosti-byty-domy/603644-nabidka-prodej-bytu-nuslova-prague').getDf()
house_2 = house_2.set_index(['lat', 'long'])

In [210]:
pd.concat([df_test,house_1, house_2])

Unnamed: 0_level_0,Unnamed: 1_level_0,variable,value
lat,long,Unnamed: 2_level_1,Unnamed: 3_level_1
50.0707253,14.3752353,Internet:,
50.0707253,14.3752353,Číslo inzerátu:,522228
50.0707253,14.3752353,Dispozice:,3+1
50.0707253,14.3752353,Plocha:,75 m²
50.0707253,14.3752353,Cena:,6.400.000 Kč
50.0707253,14.3752353,Město:,Praha
50.0707253,14.3752353,Městská část:,Smíchov
50.0707253,14.3752353,Typ vlastnictví:,Osobní
50.0707253,14.3752353,Typ budovy:,Panel
50.0707253,14.3752353,PENB:,A


# Downloader class
- Manual test
- defining class

## Manual test

In [12]:
def getLinks_test(link):
    
    page = requests.get(link)
    if page.status_code == requests.codes.ok:
        wp = getSoup_test(link)
        base_url = 'https://www.bezrealitky.cz'
        links = wp.findAll('div', {'product__body'})
        return [base_url + equity.find('a')['href'] for equity in links]

    next_page_text = wp.find('ul', class_="pagination justify-content-md-end").findAll('li')[-1].text

    if next_page_text == 'Další >':
        next_page_partial = wp.find('ul', class_="pagination justify-content-md-end").findAll('li')[-1].find('a')['href']
        next_page_url = base_url + next_page_partial
        links = wp.findAll('div', {'product__body'})
        return [base_url + equity.find('a')['href'] for equity in links]
    # No more 'Next' pages, finish the script
    else:
        print('Done')

In [13]:
l = "https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?_token=qNLlM7SSvDXIFrdOKHnbj0gv_-OMf7vw6yUc_-ILcFo"
links = getLinks_test(l)

In [14]:
HouseForSale(links[0]).getDf()

Unnamed: 0,value
Internet:,
Číslo inzerátu:,602006
Dispozice:,4+1
Plocha:,95 m²
Cena:,3.290.000 Kč
Město:,Praha
Městská část:,Řepy
Typ vlastnictví:,Družstevní
Typ budovy:,Panel
PENB:,D


## Defining class
Manual test working - Downloader_single() working
Pagination - Downloader_multi() work in progress

In [16]:
# Downloader(l)

In [17]:
class Downloader_single:
    '''
    Download all links real estate to initiate
    '''
    def __init__(self, link):
        '''
        Give link to extract links of given webpages
        '''
        self.link = link
    
    def getSoup(self):
        r = requests.get(self.link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'lxml')

    def getLinks(self):
        page = requests.get(self.link)
        if page.status_code == requests.codes.ok:
            wp = self.getSoup()
            base_url = 'https://www.bezrealitky.cz'
            links = wp.findAll('div', {'product__body'})
            return [base_url + equity.find('a')['href'] for equity in links]

In [18]:
Downloader_single('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?_token=qNLlM7SSvDXIFrdOKHnbj0gv_-OMf7vw6yUc_-ILcFo&page=3').getLinks()

['https://www.bezrealitky.cz/nemovitosti-byty-domy/604370-nabidka-prodej-bytu-spanielova-praha',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/596362-nabidka-prodej-bytu-merhoutova-prague',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604185-nabidka-prodej-bytu-ucnovska',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604354-nabidka-prodej-bytu-sanderova',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/602506-nabidka-prodej-bytu-nekvasilova',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604357-nabidka-prodej-bytu-mantovska',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604332-nabidka-prodej-bytu-spanielova-praha',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/597584-nabidka-prodej-bytu-v-remizku-praha',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/603606-nabidka-prodej-bytu-tulesicka',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604353-nabidka-prodej-bytu-za-zameckem-praha']

In [19]:
class Downloader_multi:
    '''
    Download all links real estate to initiate
    '''
    def __init__(self, link):
        '''
        Give link to extract links of given webpages
        '''
        self.link = link
    
    def getSoup(self):
        r = requests.get(self.link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'lxml')

    def getLinks(self):
        page = requests.get(self.link)
        if page.status_code == requests.codes.ok:
            wp = self.getSoup()
            base_url = 'https://www.bezrealitky.cz'
            links = wp.findAll('div', {'product__body'})
            return [base_url + equity.find('a')['href'] for equity in links]
        
#         if next_page_text == 'Další >':
#             next_page_partial = wp.find('ul', class_="pagination justify-content-md-end").findAll('li')[-1].find('a')['href']
#             next_page_url = base_url + next_page_partial
#             getLinks(next_page_url)
#         # No more 'Next' pages, finish the script
#         else:
#             print('Done')
        
            
        

In [20]:
Downloader_single('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?_token=qNLlM7SSvDXIFrdOKHnbj0gv_-OMf7vw6yUc_-ILcFo&page=63').getLinks()

[]

In [34]:
class Downloader_multi_byJan:
    '''
    Download all links real estate to initiate
    '''
    def __init__(self, link):
        '''
        Give link to extract links of given webpages
        '''
        self.link = link
    
    def getSoup(self):
            r = requests.get(link)
            r.encoding = 'UTF-8'
            return BeautifulSoup(r.text, 'lxml')

    def getSoupNext(link): 
            r = requests.get(link)
            r.encoding = 'UTF-8'
            return BeautifulSoup(r.text, 'lxml')

    def getAllPages(self):    
            wp = self.getSoup(link)
            base_url = 'https://www.bezrealitky.cz'
            test_element = wp.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')
            pglinks = pd.Series(['https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha'])
            while test_element != None:
                    partial_url = wp.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')['href']
                    next_url = base_url + partial_url
                    pglinks = pglinks.append(pd.Series([next_url]), ignore_index = True)
                    wp_next = self.getSoupNext(next_url)
                    test_element = wp_next.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')
                    wp = wp_next
            else: print("Loop is done")
            # return pglinks
        
    def getLinks(self):
            page = requests.get(link)
            if page.status_code == requests.codes.ok:
                for pg in pglinks:

                    wp = self.getSoup()
                    base_url = 'https://www.bezrealitky.cz'
                    eqlinks = wp.findAll('div', {'product__body'})
                    return [base_url_prague + equity.find('a')['href'] for equity in eqlinks]


In [110]:
Webpage = Downloader_multi_byJan('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha')

# Class using outer function
This class is using outside function for method helps

In [162]:
def getSoup(link):
        r = requests.get(link)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'lxml')

def getLinks(link):
        page = requests.get(link)
        if page.status_code == requests.codes.ok:
            wp = getSoup(link)
            base_url = 'https://www.bezrealitky.cz'
            links = wp.findAll('div', {'product__body'})
            return [base_url + equity.find('a')['href'] for equity in links]

class Downloader_multi_byTiep:
    '''
    Download all links of real estate properties on the given website
    '''
    def __init__(self, link):
        '''
        Provide real estate webpage to extract the properties links
        '''
        self.link = link

    def getAllPages(self):
            '''
            Get all possible pages from the provided website that contains links
            '''    
            wp = getSoup(self.link)
            base_url = 'https://www.bezrealitky.cz'
            test_element = wp.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')
            links = pd.Series(['https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha'])
            while test_element != None:
                    partial_url = wp.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')['href']
                    next_url = base_url + partial_url
                    links = links.append(pd.Series([next_url]), ignore_index = True)
                    wp_next = getSoup(next_url)
                    test_element = wp_next.find('ul', class_="pagination justify-content-md-end").find('a', rel = 'next')
                    wp = wp_next
            else: print("Loop is done")
            return list(links)

In [163]:
links_class = Downloader_multi_byTiep('https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha').getAllPages()

Loop is done


In [164]:
links_class

['https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=2',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=3',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=4',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=5',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=6',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=7',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=8',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=9',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=10',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=11',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=12',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=13',
 'https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha?page=14',
 'https://www.bezrealitky.cz/vypis/n

In [168]:
links_properties = [getLinks(li) for li in links_class]

In [238]:
links_properties_unlist = list(itertools.chain(*links_properties))

In [246]:
np.shape(links_properties_unlist)

(620,)

<filter at 0x1263174a8>

In [169]:
links_properties_test = getLinks(links_class[4])

In [172]:
links_properties_test

['https://www.bezrealitky.cz/nemovitosti-byty-domy/604357-nabidka-prodej-bytu-mantovska',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604332-nabidka-prodej-bytu-spanielova-praha',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/597584-nabidka-prodej-bytu-v-remizku-praha',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/603606-nabidka-prodej-bytu-tulesicka',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604353-nabidka-prodej-bytu-za-zameckem-praha',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/539248-nabidka-prodej-bytu',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604154-nabidka-prodej-bytu-uprkova-klanovice',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/604305-nabidka-prodej-bytu-omska-praha',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/603550-nabidka-prodej-bytu-praha',
 'https://www.bezrealitky.cz/nemovitosti-byty-domy/597004-nabidka-prodej-bytu-nazovska']

In [215]:
df_complet = pd.DataFrame()
for li in links_properties_test:
    df_unit = HouseForSale(li).getDf()
    df_test = pd.concat([df_test, df_uni

SyntaxError: invalid syntax (<ipython-input-215-8e70010ce893>, line 7)

In [224]:
df_test
df_test.set_index(['lat','long'])

Unnamed: 0_level_0,Unnamed: 1_level_0,variable,value
lat,long,Unnamed: 2_level_1,Unnamed: 3_level_1
50.041711,14.569642,Internet:,
50.041711,14.569642,Číslo inzerátu:,604357
50.041711,14.569642,Dispozice:,2+kk
50.041711,14.569642,Plocha:,56 m²
50.041711,14.569642,Cena:,5.400.000 Kč
50.041711,14.569642,Město:,Praha
50.041711,14.569642,Městská část:,Horní Měcholupy
50.041711,14.569642,Typ vlastnictví:,Osobní
50.041711,14.569642,Typ budovy:,Panel
50.041711,14.569642,Novostavba:,Ano


In [239]:
df_complete_sale = pd.DataFrame()
for li in links_properties_unlist:
    df_unit = HouseForSale(li).getDf()
    df_complete_sale = pd.concat([df_complete_sale, df_unit])

ConnectionError: HTTPSConnectionPool(host='www.bezrealitky.czhttps', port=443): Max retries exceeded with url: //www.bezrealitky.cz/nove-bydleni/vzorova-nemovitost/716-rezidence-silver-port/26331-3-kk-91m2-se-zahradou-p-10 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x129b87358>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [223]:
df_test.set_index(['lat','long']).index.value_counts()

(50.106458, 14.659418)     23
(50.078784, 14.48633)      22
(50.041711, 14.569642)     22
(50.058553, 14.282639)     21
(50.094149, 14.495124)     20
(50.030721, 14.369716)     18
(50.053638, 14.355187)     16
(50.06744, 14.302724)      16
(50.072871, 14.477979)     15
(50.0585874, 14.377194)    15
dtype: int64

## Tutorial for pagination
link - https://letslearnabout.net/python/beautiful-soup/how-to-get-the-next-page-on-beautiful-soup/#next-page

In [None]:
def parse_page(next_url):
    # HTTP GET requests
  page = requests.get(next_url)

  # Checking if we successfully fetched the URL
  if page.status_code == requests.codes.ok:
    bs = BeautifulSoup(page.text, 'lxml')

    check_no_results = bs.find('ul', class_="SearchResults").find('p')
    if check_no_results and check_no_results.text:
        print('Search returned no results.')
        return None

    # Fetching all items
    list_all_cd = bs.findAll('li', class_='ResultItem')

    
    for cd in list_all_cd:
      get_cd_attributes(cd)

    next_page_text = bs.find('ul', class_="SearchBreadcrumbs").findAll('li')[-1].text
    
    if next_page_text == 'Next':
        next_page_partial = bs.find('ul', class_="SearchBreadcrumbs").findAll(
            'li')[-1].find('a')['href']
        next_page_url = base_url + next_page_partial
        print(next_page_url)
        parse_page(next_page_url)
    # No more 'Next' pages, finish the script
    else:
        export_table_and_print(data)

parse_page(search_url)