In [1]:
import openrouteservice
import folium
from keplergl import KeplerGl


# 1. Find the sweet spot, based on 30min bike ride between office and city center

In [5]:

client = openrouteservice.Client(key='5b3ce3597851110001cf6248171a4636f30449aa9e446f0682f74ebb') # Specify your personal API key


In [14]:
loc_dict = {
    'first': { #kantar
        'location': [11.2607037, 44.4841799] #swapped lat,long -> long, lat
    }, 
    'second': { #due torri
        'location': [11.3445425, 44.4942094]
    }
}

# Set up folium map
map1 = folium.Map(tiles='Stamen Toner', location=([44.4942094, 11.3445425]), zoom_start=12)

# Request of isochrones with 25 minute bike ride.
params_iso = {'profile': 'cycling-regular',
              'intervals': [1800], # 1800/60 = 30 mins
              'segments': 1800,
              'attributes': ['total_pop'] # Get population count for isochrones
             }

for name, loc in loc_dict.items():
    params_iso['locations'] = [loc['location']] # Add apartment coords to request parameters
    loc['iso'] = client.isochrones(**params_iso) # Perform isochrone request
    folium.features.GeoJson(loc['iso']).add_to(map1) # Add GeoJson to map
    
    folium.map.Marker(list(reversed(loc['location'])), # reverse coords due to weird folium lat/lon syntax
                      icon=folium.Icon(color='lightgray',
                                        icon_color='#cc0000',
                                        icon='map-marker',
                                        prefix='fa',
                                       ),
                      popup=name,
                 ).add_to(map1) # Add apartment locations to map

map1


    




we use this sweet spot to trace the area where we look for the available apartments. We go to the website Immobiliare.it, draw the area of interest and scrape the website to take the information needed. 


# 2. Scrape Immobiliare.it using Beautiful Soup

Searching Immobiliare.it, we have the opportunity to draw a polygon directly on the map. We mimick the area found using isochrones and we obtain the url for the search result 

<img src = "search-area.png">

In [157]:
# using this learning content: https://realpython.com/beautiful-soup-web-scraper-python/
# using this learning content: https://learningactors.com/how-to-scrape-multiple-pages-of-a-website-using-a-python-web-scraper/
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import re 
import numpy as np

from time import sleep
from random import randint

# this is the website after the search. the url seems to carry the lat/lon coords for the search area. this is useful
# we could teorethically use this in the future to change the dimensions of the search area
# there are multiple pages, so I want to scrape all the consecutive pages

# create variable to store page numbers to iterate over
pages = np.arange(1, 50, 1)
tmp_results = []
final_results = []

# iterate over multiple pages and get the request
for page in pages: 
    page = requests.get('https://www.immobiliare.it/ricerca.php?idCategoria=1&idContratto=2&idNazione=IT&criterio=rilevanza&ordine=desc&pag='+str(page)+'&vrt=44.483219,11.351624;44.502811,11.352997;44.528822,11.341324;44.522703,11.305618;44.521234,11.287079;44.491852,11.273003;44.484994,11.263046;44.463191,11.279869;44.447997,11.297035;44.474216,11.312828;44.473726,11.331367;44.469071,11.349907;44.483219,11.351624')
    # create a bs4 object read in html
    soup = BeautifulSoup(page.content, 'html.parser')
    sleep(randint(2,10))
    tmp_results.append(soup)

# iterate over pages and extract text
for ix, soup in enumerate(tmp_results):
    # here we get the content of each ads. We inspected the HTML structure here: https://webformatter.com/html
    final_results.append(soup.find_all( 'div', class_ = 'listing-item_body--content'))

# request the url
# url = 'https://www.immobiliare.it/ricerca.php?idCategoria=1&idContratto=2&idNazione=IT&criterio=rilevanza&ordine=desc&pag=1&vrt=44.483219,11.351624;44.502811,11.352997;44.528822,11.341324;44.522703,11.305618;44.521234,11.287079;44.491852,11.273003;44.484994,11.263046;44.463191,11.279869;44.447997,11.297035;44.474216,11.312828;44.473726,11.331367;44.469071,11.349907;44.483219,11.351624'
# page = requests.get(url)




In [240]:
# create tmp lists
name_lst, summary_lst, url_lst, price_lst, rooms_lst, sqm_lst, baths_lst, floor_lst = ([] for i in range(8))
dict_ = {}

for ix, page in enumerate(final_results):
    for elem in page:
        
        # Each elem is a new BeautifulSoup object.
        # You can use the same methods on it as you did before.

# ok        
#         #extract name of listing
#         name = elem.find_all('p', class_ ="titolo text-primary")
        
#         # copy name text and if not found default
#         for i, val in enumerate(name):
#             if val.find('a')['title'] is not None:
#                 name_lst.append(str(val.find('a')['title']))
#             else:
#                 name_lst.append("not_found")
#                 continue
        
        
#         # extract short summary
#         summary = elem.find_all('p', class_ ="descrizione__truncate")
#         for i, val in enumerate(summary):
#             if val.text is not None:
#                 summary_lst.append(val.text)
#             else: 
#                 summary_lst.append("not_found")
                
    
#         #extract the ad link
#         if elem.find('a')['href'] is not None:
#             url_lst.append(elem.find('a')['href'])
#         else: 
#             url_lst.append("not_found")
    
#         #extract the apt.price and trasnfrom to int. when not available we default ot 0
#         if elem.find('li', class_ = "lif__item lif__pricing") is not None:
#             price_lst.append(
#                 int(re.sub(r"[^a-zA-Z0-9]+", ' ', 
#                        elem.find('li', class_ = "lif__item lif__pricing").text.strip()).replace(" ", "")
#             )
#         )
#         else: 
#             price_lst.append(0)



        #extract apt features. the structure is rooms, sqm, baths, floor level
        for i in range(0, len(elem.find_all('span', class_ ="text-bold"))):
            if i == 0:
                if elem.find_all('span', class_ ="text-bold") is not None:
                    rooms_lst.append("num_"+str(elem.find_all('span', class_ ="text-bold")[i].text))
                else:
                    rooms_lst.append("num_0")
            
            if i == 1:
                if elem.find_all('span', class_ ="text-bold") is not None:
                    sqm_lst.append("num_"+str(elem.find_all('span', class_ ="text-bold")[i].text))
                else:
                    sqm_lst.append("num_0")
            
            if i == 2:
                if elem.find_all('span', class_ ="text-bold") is not None:
                    baths_lst.append("num_"+str(elem.find_all('span', class_ ="text-bold")[i].text.strip().replace(" ", "")))
                else:
                    baths_lst.append("num_0")
                
        # extract floor value
        if elem.find('abbr', class_ ="text-bold im-abbr") is not None:
            floor_lst.append(elem.find('abbr', class_ ="text-bold im-abbr").text.strip().replace(" ", ""))
        else: 
            floor_lst.append("num_0")

        
#         dict_['name'] = name_lst
#         dict_['summary'] = summary_lst
#         dict_['url'] = url_lst
#         dict_['price'] = price_lst
#         dict_['rooms'] = rooms_lst
#         dict_['sqm'] = sqm_lst
#         dict_['baths'] = baths_lst
#         dict_['floor'] = floor_lst
    

    



In [271]:
# create tmp lists
dict_ = {
    'name': [], 
    'summary': [],
    'url': [], 
    'price': [], 
    'rooms': [],
    'sqm': [],
    'baths': [], 
    'floors': []
}

for ix, page in enumerate(final_results):
    for elem in page:
        #extract name of listing
        name = elem.find_all('p', class_ ="titolo text-primary")
        
        # copy name text and if not found default
        for i, val in enumerate(name):
            if val.find('a')['title'] is not None:
                dict_['name'].append(str(val.find('a')['title']))
            else:
                dict_['name'].append("not_found")
                continue
        
        
        # extract short summary
        summary = elem.find_all('p', class_ ="descrizione__truncate")
        for i, val in enumerate(summary):
            if val.text is not None:
                dict_['summary'].append(val.text)
            else: 
                dict_['summary'].append("not_found")
                
    
        #extract the ad link
        if elem.find('a')['href'] is not None:
            dict_['url'].append(elem.find('a')['href'])
        else: 
            dict_['url'].append("not_found")
    
        #extract the apt.price and trasnfrom to int. when not available we default ot 0
        if elem.find('li', class_ = "lif__item lif__pricing") is not None:
            dict_['price'].append(
                int(re.sub(r"[^a-zA-Z0-9]+", ' ', 
                       elem.find('li', class_ = "lif__item lif__pricing").text.strip()).replace(" ", "")
            )
        )
        else: 
            dict_['price'].append(0)
            
        # extract floor value
        if elem.find('abbr', class_ ="text-bold im-abbr") is not None:
            dict_['floors'].append(elem.find('abbr', class_ ="text-bold im-abbr").text.strip().replace(" ", ""))
        else: 
            dict_['floors'].append("num_0")
            
        #extract apt features. the structure is rooms, sqm, baths, floor level
        for i in range(0, len(elem.find_all('span', class_ ="text-bold"))):
            if i == 0:
                if elem.find_all('span', class_ ="text-bold") is not None:
                    dict_['rooms'].append(elem.find_all('span', class_ ="text-bold")[i].text)
                else:
                    dict_['rooms'].append("num_0")
            
            if i == 1:
                if elem.find_all('span', class_ ="text-bold") is not None:
                    dict_['sqm'].append(elem.find_all('span', class_ ="text-bold")[i].text)
            
            if i == 2:
                if elem.find_all('span', class_ ="text-bold") is not None:
                    dict_['baths'].append(elem.find_all('span', class_ ="text-bold")[i].text.strip().replace(" ", ""))


In [272]:
df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in dict_.items() ]))

In [273]:
df

Unnamed: 0,name,summary,url,price,rooms,sqm,baths,floors
0,"Mansarda via Andrea Costa 5, Costa, Bologna",Via Andrea Costa Ad.ze Porta Sant' Isaia a poc...,https://www.immobiliare.it/annunci/60836460/,1050,3,75,1,4
1,"Trilocale via Guido Reni, Centro Storico, Bologna",Rif: 2502 - CODICE DI RIFERIMENTO: 2502 Centro...,https://www.immobiliare.it/annunci/84255190/,1100,3,100,1,1
2,"Monolocale via de' Carbonesi 8, Centro Storico...",D'Azeglio (Adiacenze) – Monolocale Ristruttura...,https://www.immobiliare.it/annunci/86597554/,580,1,40,1,T
3,"Appartamento via Malta, Colli Saragozza, Bologna",Appartamento P.1° con quadruplo affaccio - 3 c...,https://www.immobiliare.it/annunci/86491670/,2000,5,170,2,1
4,"Mansarda piazza San Martino 4, Centro Storico,...",centro storico affittasi in palazzo d'epoca '7...,https://www.immobiliare.it/annunci/86137540/,720,2,50,1,3
...,...,...,...,...,...,...,...,...
1220,"Bilocale primo piano, Centro Storico, Bologna",Proponiamo in locazione in una delle strade pi...,https://www.immobiliare.it/annunci/86634318/,1200,2,85,,1
1221,"Quadrilocale buono stato, Marconi, Bologna",BOLOGNA - VIA AMENDOLA: Comodissimo al centro ...,https://www.immobiliare.it/annunci/83595537/,1700,4,18,,num_0
1222,"Bilocale nuovo, primo piano, Noce - Pescarola,...",BOLOGNA - La Santi Immobiliare dispone di appa...,https://www.immobiliare.it/annunci/85055230/,1200,2,35,,1
1223,"Bilocale buono stato, terzo piano, Marconi, Bo...",affittasi a studenti via dei mille ampia camer...,https://www.immobiliare.it/annunci/83762657/,600,2,,,3
