In [1]:
from isochrones import plot_isochrones
from immobiliare import immobiliare_scraper, immobiliare_html_to_df, initialize_db_extract_immobiliare, insert_immobiliare


# Goal of this notebook: Identify potential candidates for opportunities to buy a house in Bologna

Objectives: 
1. Track the market over a 2-3 year horizon
2. Evaluate the important features and what represents a "good price" for the type of house needed

The kind of house needed may change over that time horizon. However, I'll start this analysis and the ETL with a few simple features in mind, to narrow down the potential candidates.

The starting point is the identification of the area in which to scrape potential opportunities. We focus on Immobiliare.it, as it represents a good source of apartments and houses. We are aware that we might miss private transactions, and we might need to add these later on.

<img src = "search-area-sales.png"></img>

The area of search is wide but focused on the southern part of Bologna. 


# 1. ETL and data extraction

## 1.1 Extract data from HTML and copy raw data to Postgres

We split the listings into rentals, new developments, and second-hand properties.

In [2]:
# Search-area polygon passed as the `vrt` query parameter: "lat,lon" vertices
# separated by ';', with the first vertex repeated at the end.
# The same polygon is used for all three searches.
search_area = '44.489776,11.357544;44.494249,11.34675;44.495691,11.337407;44.493154,11.336368;44.491507,11.339624;44.486731,11.339628;44.490634,11.329837;44.494753,11.329349;44.498877,11.327549;44.501142,11.321222;44.501225,11.305553;44.49277,11.302361;44.484392,11.288604;44.474124,11.314179;44.474465,11.351611;44.479087,11.364605;44.489776,11.357544'

# Immobiliare.it search URLs: rentals, new developments, second-hand sales.
url_rent = 'https://www.immobiliare.it/ricerca.php?idCategoria=1&idContratto=2&idNazione=IT&criterio=rilevanza&ordine=desc&pag=1&vrt=' + search_area
url_new_dev = 'https://www.immobiliare.it/ricerca_nc.php?idCategoria=6&idContratto=&idTipologia[]=&idRegione=&idProvincia=&idComune=&prezzoMinimo=&prezzoMassimo=&superficieMinima=&superficieMassima=120&bagni=&criterio=rilevanza&ordine=desc&vrt=' + search_area
url_second_hand = 'https://www.immobiliare.it/ricerca.php?idCategoria=1&idContratto=1&idNazione=IT&criterio=rilevanza&ordine=desc&pag=1&vrt=' + search_area

# Scrape each search. The second argument labels the listing kind
# ('rent' or 'sales'); how it is used is defined in immobiliare_scraper.
html_rent = immobiliare_scraper(url_rent, 'rent')
html_new_dev = immobiliare_scraper(url_new_dev, 'sales')
html_second_hand = immobiliare_scraper(url_second_hand, 'sales')

In [3]:
# df_new_dev = immobiliare_html_to_df(html_new_dev)
# df_second_hand = immobiliare_html_to_df(html_second_hand)


In [4]:
# Initialise one raw-extract table per listing kind if it does not exist yet.
for raw_html, table in (
    (html_rent, "immobiliare_rent"),
    (html_new_dev, "immobiliare_sales_new_development"),
    (html_second_hand, "immobiliare_sales_second_hand"),
):
    initialize_db_extract_immobiliare(raw_html, table)

The database is already initialized
The database is already initialized
The database is already initialized


In [5]:
# Insert the freshly scraped rows, checking against what is already stored in each table.
for raw_html, table in (
    (html_rent, "immobiliare_rent"),
    (html_second_hand, "immobiliare_sales_second_hand"),
    (html_new_dev, "immobiliare_sales_new_development"),
):
    insert_immobiliare(raw_html, table)
            


Inserted 29 rows
Inserted 16 rows
Inserted 3 rows


## 1.2 Clean the raw extracted data for further analysis

In [48]:
# Standard library
import os
import re
# ETL lib
import numpy as np
import pandas as pd
# Postgres DB
from sqlalchemy import create_engine
import psycopg2
# Geo
# learning https://geopy.readthedocs.io/en/stable/#mapbox
# learning https://towardsdatascience.com/geocode-with-python-161ec1e62b89
import geopandas as gpd # https://geopandas.readthedocs.io/en/latest/gallery/create_geopandas_from_pandas.html
import geopy as gpy
from geopy.geocoders import Nominatim, MapBox
from geopy.extra.rate_limiter import RateLimiter
# Visualization
from keplergl import KeplerGl



# quick function to pull data from the postgres engine
def pull_data(sql, con=None):
    """
    Run a SQL query and return the result as a DataFrame.

    Parameters
    ----------
    sql : str
        Query text handed to ``pandas.read_sql``.
    con : optional
        Connection or SQLAlchemy engine to run the query on. When omitted,
        the notebook-level ``engine`` is used; it is looked up at call time,
        so it only has to exist before the first call, not before this
        definition.

    Returns
    -------
    pandas.DataFrame
        The query result.
    """
    if con is None:
        # Fall back to the notebook's shared Postgres engine.
        con = engine
    return pd.read_sql(sql, con=con)

# create Postgres engine
# Connection settings are read from the conventional PG* environment variables
# when they are set; the fallbacks are the local values this notebook used
# before, so an unconfigured environment behaves as it did.
# NOTE(review): drop the password fallback once PGPASSWORD is set wherever this runs.
engine = create_engine(
    'postgresql+psycopg2://{}:{}@{}:{}/postgres'.format(
        os.environ.get('PGUSER', 'manfredi'),      # username
        os.environ.get('PGPASSWORD', 'manfredi'),  # password
        os.environ.get('PGHOST', 'localhost'),     # host
        os.environ.get('PGPORT', '5432'),          # local port
    ),
    echo=False,
)

# Query selecting every column of every row from the new-development table
# (one of the tables filled by the insert step earlier in the notebook).
sql = """
SELECT
    * 
FROM 
    public.immobiliare_sales_new_development
"""

# Load the query result into a DataFrame and print its column/dtype summary.
df_new_development =  pull_data(sql)
df_new_development.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       78 non-null     object
 1   name     78 non-null     object
 2   summary  78 non-null     object
 3   price    78 non-null     object
 4   sqm      78 non-null     object
 5   rooms    78 non-null     object
 6   baths    78 non-null     object
 7   floors   78 non-null     object
 8   url      78 non-null     object
dtypes: object(9)
memory usage: 5.6+ KB


In [44]:
# identify general point location for apartment
# The MapBox access token is read from the environment so that it is never
# stored in the notebook itself.
mapbox_api = os.environ.get("MAPBOX_API_KEY")
if not mapbox_api:
    raise RuntimeError("Set the MAPBOX_API_KEY environment variable to a MapBox access token")

# 1 - rate-limited geocoder: at least 1 second between geocoding calls
locator = MapBox(mapbox_api)
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
# 2 - geocode the listing name into a location (None when nothing is found)
# NOTE(review): in the saved output the "via Audinot 18-2" listings resolve to
# "Osteria Fuori Porta, Via della Beverara, 8", so the free-text name is not
# always matched to the right address; consider cleaning the query or
# restricting the geocoder's search area.
df_new_development['location'] = df_new_development['name'].apply(geocode)
# 3 - (latitude, longitude, altitude) tuple per row, or None when geocoding failed
df_new_development['point'] = df_new_development['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# 4 - split the point into latitude, longitude and altitude columns.
# Rows without a point get NaN in all three columns; building the frame
# straight from a list that mixes tuples and None would fail.
missing_point = (float('nan'),) * 3
df_new_development[['latitude', 'longitude', 'altitude']] = pd.DataFrame(
    [point if isinstance(point, tuple) else missing_point for point in df_new_development['point']],
    index=df_new_development.index,
    columns=['latitude', 'longitude', 'altitude'],
)

df_new_development.head()

Unnamed: 0,id,name,summary,price,sqm,rooms,baths,floors,url,location,point,latitude,longitude,altitude
0,AtticoviaSantoStefano91SantoStefanoBologna_570...,"Attico via Santo Stefano 91, Santo Stefano, Bo...",Centro Storico Via SANTO STEFANO 91 - COMPLETA...,570000,108,3,2,2,https://www.immobiliare.it/annunci/85084456/,"(Via Santo Stefano 91, 40125 Bologna Metropoli...","(44.487351, 11.35277, 0.0)",44.487351,11.35277,0.0
1,AppartamentoviaAudinot18-2SaragozzafuoriPortaB...,"Appartamento via Audinot 18-2, Saragozza fuor...",Rif: 30232 - Appartamento composto da: ingress...,558000,115,5,2,Pianorialzato,https://www.immobiliare.it/annunci/85672574/,"(Osteria Fuori Porta, Via della Beverara, 8, B...","(44.511095, 11.329733, 0.0)",44.511095,11.329733,0.0
2,BilocaleviaAudinot18-2SaragozzafuoriPortaBolog...,"Bilocale via Audinot 18-2, Saragozza fuori Po...",Rif: 30230 - Appartamento composto da: soggior...,292000,60,2,1,Pianorialzato,https://www.immobiliare.it/annunci/85672578/,"(Osteria Fuori Porta, Via della Beverara, 8, B...","(44.511095, 11.329733, 0.0)",44.511095,11.329733,0.0
3,AppartamentoviaAudinot18-2SaragozzafuoriPortaB...,"Appartamento via Audinot 18-2, Saragozza fuor...",Rif: 30227 - Appartamento composto da: ingress...,592000,118,5,2,1,https://www.immobiliare.it/annunci/85672580/,"(Osteria Fuori Porta, Via della Beverara, 8, B...","(44.511095, 11.329733, 0.0)",44.511095,11.329733,0.0
4,QuadrilocaleviaAudinot18-2SaragozzafuoriPortaB...,"Quadrilocale via Audinot 18-2, Saragozza fuor...",Rif: 30223 - Appartamento composto da: soggior...,446000,86,4,2,2,https://www.immobiliare.it/annunci/85672582/,"(Osteria Fuori Porta, Via della Beverara, 8, B...","(44.511095, 11.329733, 0.0)",44.511095,11.329733,0.0


In [58]:
# Sanity check on prices: the column is still text, so list every row whose
# price string is longer than 6 characters.
price_too_long = df_new_development['price'].str.len().gt(6)
df_new_development.loc[price_too_long]

Unnamed: 0,id,name,summary,price,sqm,rooms,baths,floors,url


In [None]:
# Create a kepler.gl map widget with default settings; leaving it as the last
# expression of the cell renders it in the notebook.
# NOTE(review): no data is added to the map in this cell.
map_1 = KeplerGl()
map_1