In [1]:
from isochrones import plot_isochrones
from immobiliare import immobiliare_scraper, immobiliare_html_to_df, initialize_db_extract_immobiliare, insert_immobiliare


ImportError: cannot import name 'immobiliare_scraper'

# Goal of this notebook: Identify potential candidates for opportunities to buy a house in Bologna

Objectives: 
1. Track the market, with a 2-3 year horizon in mind
2. Evaluate the important features and what represents a "good price" for the type of house needed

The type of house needed may well change over this time horizon. However, I'll start the analysis and the ETL with a few simple features in mind, to limit and narrow down the potential candidates.

The starting point is the identification of the area in which to scrape potential opportunities. We focus on Immobiliare.it, as it represents a good source of apartments and houses. We are aware that we might miss private transactions, and we might need to add them later on. 

<img src = "search-area-sales.png"></img>

The area of search is wide but focused on the southern part of Bologna. 


# 1. ETL and data extraction

We split the listings between new-development sites and second-hand properties. 

In [None]:
# HTML scrape lib
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
# ETL lib
import re
import numpy as np
import pandas as pd
# Postgres DB
from sqlalchemy import create_engine
import psycopg2

    


    

In [None]:
# Search URLs for the two listing categories on immobiliare.it.
# - url_new_dev hits ricerca_nc.php (idCategoria=6) with superficieMassima=120.
# - url_second_hand hits ricerca.php (idCategoria=1, idContratto=1) starting at pag=1.
# Both carry the same `vrt` value: a list of coordinate pairs whose first and last
# entries coincide -- presumably the polygon of the search area; confirm against the site.
url_new_dev = 'https://www.immobiliare.it/ricerca_nc.php?idCategoria=6&idContratto=&idTipologia[]=&idRegione=&idProvincia=&idComune=&prezzoMinimo=&prezzoMassimo=&superficieMinima=&superficieMassima=120&bagni=&criterio=rilevanza&ordine=desc&vrt=44.489776,11.357544;44.494249,11.34675;44.495691,11.337407;44.493154,11.336368;44.491507,11.339624;44.486731,11.339628;44.490634,11.329837;44.494753,11.329349;44.498877,11.327549;44.501142,11.321222;44.501225,11.305553;44.49277,11.302361;44.484392,11.288604;44.474124,11.314179;44.474465,11.351611;44.479087,11.364605;44.489776,11.357544'
url_second_hand = 'https://www.immobiliare.it/ricerca.php?idCategoria=1&idContratto=1&idNazione=IT&criterio=rilevanza&ordine=desc&pag=1&vrt=44.489776,11.357544;44.494249,11.34675;44.495691,11.337407;44.493154,11.336368;44.491507,11.339624;44.486731,11.339628;44.490634,11.329837;44.494753,11.329349;44.498877,11.327549;44.501142,11.321222;44.501225,11.305553;44.49277,11.302361;44.484392,11.288604;44.474124,11.314179;44.474465,11.351611;44.479087,11.364605;44.489776,11.357544'

# Scrape each search. The original note on this cell read "get the page numbers from
# the bottom of the first page" -- presumably that describes how immobiliare_scraper
# paginates; the helper is imported from `immobiliare` and not shown here, so confirm there.
html_new_dev = immobiliare_scraper(url_new_dev)
html_second_hand = immobiliare_scraper(url_second_hand)

In [None]:
# Turn each scraped result into a table, one per listing category.
# NOTE(review): immobiliare_html_to_df is imported from `immobiliare`; its output columns
# are not visible in this notebook -- presumably a pandas DataFrame, confirm.
df_new_dev = immobiliare_html_to_df(html_new_dev)
df_second_hand = immobiliare_html_to_df(html_second_hand)


In [None]:
        
# Initial load of the new-development scrape into the table named below.
# NOTE(review): `initialize_db_extract_immobiliare` is imported from `immobiliare` in the
# first cell and defined again further down this notebook; which version runs here
# depends on cell execution order -- confirm the intended one.
initialize_db_extract_immobiliare(html_new_dev, "immobiliare_sales_new_development")

In [None]:

            
# Initial load of the second-hand scrape into the table named below.
# NOTE(review): the function name is both imported from `immobiliare` and redefined in a
# later cell of this notebook; which version runs here depends on cell execution order.
initialize_db_extract_immobiliare(html_second_hand, "immobiliare_sales_second_hand")

In [None]:
# Pass the second-hand scrape and its table name to the insert helper.
# NOTE(review): insert_immobiliare is imported from `immobiliare` and not shown in this
# notebook -- presumably it adds rows to the already-initialized table; confirm.
insert_immobiliare(html_second_hand, "immobiliare_sales_second_hand")

In [None]:
def initialize_db_extract_immobiliare(html_scraped, table_name):
    """
    Create the Postgres table `table_name` and load the scraped listings into it.

    The listings are extracted with `immobiliare_html_to_df`, which relies on a fixed
    HTML structure (valid as of 30Mar2021). This function is meant for the first run
    only: when a table called `table_name` already exists, nothing is created or loaded.

    Parameters
    ----------
    html_scraped : object
        Scraped HTML, passed unchanged to `immobiliare_html_to_df`.
    table_name : str
        Name of the table to create and fill. The index is named `<table_name>_id`.

    Returns
    -------
    None
        Progress is reported with `print`.

    Raises
    ------
    psycopg2.OperationalError
        If the connection to the database cannot be opened.
    psycopg2.Error
        If the table / ownership / index statements fail (the transaction is rolled back
        and no data is loaded).
    """
    # Local import so this cell stays runnable without touching the notebook's import cell.
    from psycopg2 import sql

    # NOTE(review): credentials are hard-coded in the notebook; move them to environment
    # variables or a secrets store before sharing. Kept as-is so behavior does not change.
    db_user = 'manfredi'
    db_password = 'manfredi'
    db_host = 'localhost'
    db_port = '5432'
    db_name = 'postgres'

    # psycopg2 is used for the existence check and the DDL.
    try:
        conn = psycopg2.connect(database=db_name, user=db_user,
                                password=db_password, host=db_host, port=db_port)
    except psycopg2.OperationalError:
        # Report and re-raise: carrying on would only fail later on an unbound `conn`.
        print("I am unable to connect to the database")
        raise

    try:
        with conn.cursor() as cur:
            cur.execute(
                "select 1 from information_schema.tables where table_name=%s",
                (table_name,))
            already_initialized = cur.fetchone() is not None

            if already_initialized:
                print('The database is already initialized')
                return

            # Parse before creating anything, so a parsing failure leaves no empty table.
            listings = immobiliare_html_to_df(html_scraped)

            # Identifiers are composed with psycopg2.sql so the table name is quoted
            # correctly and every placeholder is actually filled (a positional
            # str.format with a single argument cannot fill five `{}` slots).
            create_statements = sql.SQL(
                """
                CREATE TABLE IF NOT EXISTS {table} (
                    id text,
                    name text,
                    summary text,
                    price text,
                    sqm text,
                    rooms text,
                    baths text,
                    floors text,
                    url text,
                    created_at timestamp without time zone NOT NULL DEFAULT NOW(),
                    updated_at timestamp without time zone DEFAULT NULL
                )
                WITH (
                    OIDS = FALSE
                )
                TABLESPACE pg_default;
                ALTER TABLE {table}
                OWNER to {owner};
                CREATE INDEX {index} ON {table}(id, name, sqm, price);
                """
            ).format(
                table=sql.Identifier(table_name),
                owner=sql.Identifier(db_user),
                index=sql.Identifier(table_name + '_id'),
            )

            try:
                cur.execute(create_statements)
            except psycopg2.Error:
                # Undo the partial DDL and stop: loading rows into a table that was not
                # created as declared above would hide the failure.
                conn.rollback()
                print("Something wrong happened")
                raise

        conn.commit()  # makes the new table visible to the SQLAlchemy connection below
    finally:
        # Runs on every path (already initialized, success, error) so no connection leaks.
        conn.close()

    # SQLAlchemy engine used by pandas for the bulk load.
    engine = create_engine('postgresql+psycopg2://{}:{}@{}:{}/postgres'
                           .format(db_user, db_password, db_host, db_port),
                           echo=False)
    try:
        # Assumes the DataFrame columns are a subset of the columns declared above
        # (created_at / updated_at fall back to their defaults) -- TODO confirm against
        # immobiliare_html_to_df.
        listings.to_sql(table_name, engine, if_exists='append', index=False,
                        chunksize=1000, method='multi')
    finally:
        engine.dispose()

    print('The table ' + str(table_name) + ' has been initialized with ' +
          str(len(listings)) + ' rows')


In [None]:

            
# NOTE(review): repeats a call already made in an earlier cell of this notebook. When the
# cells run top to bottom, the function defined in this notebook is the one in scope here
# (it shadows the version imported from `immobiliare`).
initialize_db_extract_immobiliare(html_second_hand, "immobiliare_sales_second_hand")

In [None]:
# NOTE(review): repeats a call already made in an earlier cell of this notebook.
# insert_immobiliare is imported from `immobiliare` and not shown here -- if it appends
# rows, running both cells would presumably load the same scrape twice; confirm.
insert_immobiliare(html_second_hand, "immobiliare_sales_second_hand")

In [None]:
        
# NOTE(review): repeats a call already made in an earlier cell of this notebook. When the
# cells run top to bottom, the function defined in this notebook is the one in scope here
# (it shadows the version imported from `immobiliare`).
initialize_db_extract_immobiliare(html_new_dev, "immobiliare_sales_new_development")