In [None]:
# Imports

import csv
import os

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [None]:
# Report the version of every library this notebook relies on

for label, module in (('pandas', pd), ('bs4', bs4), ('requests', requests), ('csv', csv)):
    print('{} version: {}'.format(label, module.__version__))

In [None]:
def get_url(city, pages):
    """
    Build the search-result page URLs for a city.

    :city: The city name (or search expression) to place in the URL path
    :pages: How many result pages to generate; pages are numbered from 1
            up to and including this value

    Returns a list of URL strings in ascending page order (empty when
    pages is less than 1).
    """
    page_template = 'https://www.pararius.com/apartments/{}/page-{}'
    return [page_template.format(city, page_number) for page_number in range(1, pages + 1)]


In [None]:
def _feature_text(item, tag, css_class, default='Undefined', remove=None):
    """Return the cleaned text of the first matching child of `item`, or `default` if there is none."""
    node = item.find(tag, {'class': css_class})
    if node is None:
        return default
    text = node.text
    if remove:
        text = text.replace(remove, '')
    # Strip last so no whitespace is left behind where `remove` used to be.
    return text.strip()


def scrape_results(urls, output_path="data/real_estate.csv"):
    """
    Scrape every listing on the given search-result pages and save them as CSV.

    :urls: list of search-result page urls, e.g. as produced by get_url
    :output_path: where the CSV is written (parent directory is created if missing)

    Returns the list of records; each record is a tuple
    (title, address, rent price, surface, rooms, interior status, agency, contact, link).
    A field whose element is missing from a listing is filled with a placeholder
    ('Undefined', or 'None' for the agency) instead of aborting the whole run.

    Note: one extra request per listing is made through scrape_contact.
    """
    listing_url = 'https://www.pararius.com{}'
    records = []

    for url in urls:
        # A timeout keeps one stalled connection from hanging the whole scrape.
        page = requests.get(url, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
        items = soup.find_all('section', {'class': 'listing-search-item'})

        for item in items:
            title = _feature_text(
                item, 'a', 'listing-search-item__link listing-search-item__link--title')
            # Drop the currency sign only when it is actually there, so other
            # price text is not truncated.
            rent_price = _feature_text(
                item, 'div', 'listing-search-item__price', remove='per month'
            ).lstrip('€').strip()
            adress = _feature_text(item, 'div', 'listing-search-item__sub-title')
            surface = _feature_text(
                item, 'li',
                'illustrated-features__item illustrated-features__item--surface-area',
                remove='m²')
            rooms = _feature_text(
                item, 'li',
                'illustrated-features__item illustrated-features__item--number-of-rooms',
                remove='rooms')
            interior_status = _feature_text(
                item, 'li',
                'illustrated-features__item illustrated-features__item--interior')
            agency = _feature_text(
                item, 'div', 'listing-search-item__info', default='None')

            # The first anchor inside the listing points at its detail page.
            anchor = item.a
            href = anchor.get('href') if anchor is not None else None
            if href:
                link = listing_url.format(href)
                try:
                    contact = scrape_contact(link)
                except (AttributeError, TypeError, KeyError):
                    # Detail page without the expected agent block.
                    contact = 'Undefined'
            else:
                link = ''
                contact = 'Undefined'

            records.append(
                (title, adress, rent_price, surface, rooms,
                 interior_status, agency, contact, link))

    # Make sure the target directory exists so the scraped data is not lost
    # to a FileNotFoundError at the very end.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Address', 'Rent Price', 'Surface', 'Rooms',
                         'Interior Status', 'Agency', 'Contact', 'Link'])
        writer.writerows(records)

    return records


In [None]:
def scrape_contact(url):
    """
    Fetch a listing's detail page and return the agent's contact value.

    :url: url of a single listing detail page

    Returns the href of the first link inside the 'agent-summary__links'
    block with any 'tel:' prefix removed, or 'Undefined' when the page has
    no agent summary, no links block, or no link carrying an href.
    """
    # A timeout keeps one stalled connection from hanging the caller's loop.
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')

    summary = soup.find('section', class_='agent-summary')
    links = summary.find('div', class_='agent-summary__links') if summary is not None else None
    anchor = links.a if links is not None else None
    href = anchor.get('href') if anchor is not None else None

    if not href:
        return 'Undefined'
    return href.replace('tel:', '')


In [None]:
# Let's try to scrape Amsterdam's properties

CITY = 'amsterdam'
N_PAGES = 17  # number of result pages to fetch; keep below the number of pages the site shows

urls = get_url(CITY, N_PAGES)
records = scrape_results(urls)

In [None]:
# Reload the scraped listings from the CSV written by scrape_results.
# Contact is forced to text so that values which look numeric are not
# converted to integers (which would drop a leading '+' or '0').
df = pd.read_csv('data/real_estate.csv', dtype={'Contact': str})
df

In [None]:
# (number of rows, number of columns) of the loaded listings table
df.shape