# 1. ETL and data extraction

## 1.1 Extract data from HTML and copy raw data to Postgres

We split the listings into three groups: rentals, new developments, and second-hand properties.

## 1.2 Clean the raw extracted data for further analysis

In [16]:
# Standard library
import os
import re
# ETL lib
import numpy as np
import pandas as pd
# Postgres DB
from sqlalchemy import create_engine
import psycopg2
# Geo
# learning https://geopy.readthedocs.io/en/stable/#mapbox
# learning https://towardsdatascience.com/geocode-with-python-161ec1e62b89
import geopandas as gpd # https://geopandas.readthedocs.io/en/latest/gallery/create_geopandas_from_pandas.html
import geopy as gpy
from geopy.geocoders import Nominatim, MapBox
from geopy.extra.rate_limiter import RateLimiter
# Visualization
from keplergl import KeplerGl



# quick function to pull data from the postgres engine
def pull_data(sql, con=None):
    """
    Run a SQL query and return the result as a DataFrame.

    Parameters
    ----------
    sql : str
        Query text handed to ``pandas.read_sql``.
    con : connection or engine, optional
        Database connection to run the query on. When omitted, the
        notebook-level ``engine`` is used, so that variable must exist
        by the time the function is called.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    # Resolve the default lazily: ``engine`` is created after this definition,
    # so it cannot be bound as a default argument value.
    connection = engine if con is None else con
    return pd.read_sql(sql, con=connection)

# create Postgres engine
# Connection settings come from the environment so credentials do not have to
# live in the notebook; the defaults reproduce the previous local setup.
# FIXME: drop the default user/password once the environment is configured.
engine = create_engine('postgresql+psycopg2://{}:{}@{}:{}/{}'
                           .format(os.environ.get('PGUSER', 'manfredi'),  # username
                                   os.environ.get('PGPASSWORD', 'manfredi'),  # password
                                   os.environ.get('PGHOST', 'localhost'),  # host
                                   os.environ.get('PGPORT', '5432'),  # local port
                                   os.environ.get('PGDATABASE', 'postgres')  # database
                                   ), echo=False
                           )

# every column of the new-development sales table
sql = """
SELECT
    * 
FROM 
    public.immobiliare_sales_new_development
"""

# pull data
df_new_development = pull_data(sql)
# column names, non-null counts and dtypes of the raw extract
df_new_development.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       81 non-null     object
 1   name     81 non-null     object
 2   summary  81 non-null     object
 3   price    81 non-null     object
 4   sqm      81 non-null     object
 5   rooms    81 non-null     object
 6   baths    81 non-null     object
 7   floors   81 non-null     object
 8   url      81 non-null     object
dtypes: object(9)
memory usage: 5.8+ KB


In [26]:
# identify general point location for apartment
# Prefer a token supplied through the MAPBOX_API_KEY environment variable.
# FIXME: the literal fallback below is a credential committed to the notebook;
# rotate it and remove the fallback once the environment variable is in place.
mapbox_api = os.environ.get(
    "MAPBOX_API_KEY",
    "pk.eyJ1IjoibWFuZnJlZGltaXJhdWxhIiwiYSI6ImNrbDNsa3o0czBsYXQzM21udHhyaDBma3cifQ.WbGnfKPvGjFECvNnBYyNWw",
)

# 1. we get the point location of the street
def get_street_lat_lon(df):
    """
    Geocode the ``name`` column with MapBox and attach coordinates to ``df``.

    The frame is modified in place and also returned. Columns written:
    ``location`` (geocoder result, or None when nothing was found),
    ``point`` (tuple built from ``location.point``, or None),
    ``latitude``, ``longitude``, ``altitude`` and ``geometry``.

    Rows that could not be geocoded get NaN coordinates instead of breaking
    the column split, which previously failed when no row had a point.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``name`` column with the text to geocode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns above added.
    """
    coordinate_columns = ['latitude', 'longitude', 'altitude']

    # 1 - convenient wrapper that waits at least 1 second between geocoding calls
    locator = MapBox(mapbox_api)
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
    # 2 - create location column
    df['location'] = df['name'].apply(geocode)
    # 3 - create the (latitude, longitude, altitude) tuple from the location column
    df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
    # 4 - split point column into latitude, longitude and altitude columns;
    #     failed lookups are padded so the helper frame always has three columns
    missing = (np.nan, np.nan, np.nan)
    coordinates = [point if point is not None else missing for point in df['point']]
    df[coordinate_columns] = pd.DataFrame(
        coordinates, index=df.index, columns=coordinate_columns
    )
    df['geometry'] = gpd.points_from_xy(df.longitude, df.latitude)

    return df

# 2. we flag prices against fixed thresholds as a plausibility check (a median baseline by location and sqm is not implemented yet)
def price_checker(df, table_type='sales'):
    """
    Flag prices against a fixed threshold that depends on the listing type.

    ``price`` is cast to int in place, then ``price_check`` is written:

    * ``'sales'``: 1 when ``price <= 90000``, otherwise 0
    * ``'rent'``:  1 when ``price >= 7000``, otherwise 0

    NOTE(review): a flag of 1 presumably marks a suspicious price; confirm the
    intended meaning with the downstream analysis.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``price`` column convertible to int.
    table_type : {'sales', 'rent'}, default 'sales'
        Which threshold rule to apply.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``price`` as int and ``price_check`` added.

    Raises
    ------
    ValueError
        If ``table_type`` is not ``'sales'`` or ``'rent'``.
    """
    # Validate before touching the frame so a bad argument leaves it unchanged.
    if table_type not in ('sales', 'rent'):
        raise ValueError(
            "table_type must be 'sales' or 'rent', got {!r}".format(table_type)
        )

    # first transform the price to an integer
    df['price'] = df['price'].astype(int)

    # Compare the whole column at once; a Series cannot be used in a plain `if`.
    if table_type == 'sales':
        flagged = df['price'] <= 90000
    else:
        flagged = df['price'] >= 7000

    df['price_check'] = flagged.astype(int)
    return df

            
def create_sqm_band(df):
    """
    Add a ``sqm_band`` column that buckets each row by surface area.

    Bands (upper bound inclusive):

    * ``sqm <= 70``        -> ``'less_than_70'``
    * ``70 < sqm <= 120``  -> ``'70-120'``
    * ``120 < sqm <= 200`` -> ``'120-200'``
    * ``sqm > 200``        -> ``'200+'``

    ``sqm`` is read through ``pd.to_numeric`` so numeric strings are accepted;
    the ``sqm`` column itself is left untouched. Values that cannot be parsed
    get a missing band.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``sqm`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``sqm_band`` added.
    """
    sqm = pd.to_numeric(df['sqm'], errors='coerce')
    # pd.cut uses right-closed intervals by default, matching the `<=` bounds.
    bands = pd.cut(
        sqm,
        bins=[float('-inf'), 70, 120, 200, float('inf')],
        labels=['less_than_70', '70-120', '120-200', '200+'],
    )
    # Store plain labels rather than a categorical column.
    df['sqm_band'] = bands.astype(object)
    return df

def create_price_band_rent(df, min_price=200, max_price=5000):
    """
    Flag rental prices that fall outside a fixed plausible range.

    The bounds are arbitrary, based on domain knowledge, and not dynamic:
    by default a price of 200 EUR or less, or 5000 EUR or more, is treated as
    a potential outlier.
    Next step: derive the bounds dynamically from the correlation with sqm.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``price`` column; numeric strings are accepted.
    min_price : int or float, default 200
        Prices at or below this value are flagged.
    max_price : int or float, default 5000
        Prices at or above this value are flagged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``potential_outlier`` added
        (1 = flagged, 0 = not flagged).
    """
    price = pd.to_numeric(df['price'], errors='coerce')
    # Element-wise `|` instead of `or`, and every row gets an explicit 0 or 1.
    is_outlier = (price <= min_price) | (price >= max_price)
    df['potential_outlier'] = is_outlier.astype(int)
    return df

def create_price_band_sales(df, min_price=70000, max_price=700000):
    """
    Flag sale prices that fall outside a fixed plausible range.

    The bounds are arbitrary, based on domain knowledge, and not dynamic:
    by default a price of 70000 EUR or less, or 700000 EUR or more, is treated
    as a potential outlier for properties on sale.
    Next step: derive the bounds dynamically from the correlation with sqm.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``price`` column; numeric strings are accepted.
    min_price : int or float, default 70000
        Prices at or below this value are flagged.
    max_price : int or float, default 700000
        Prices at or above this value are flagged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``potential_outlier`` added
        (1 = flagged, 0 = not flagged).
    """
    price = pd.to_numeric(df['price'], errors='coerce')
    # Element-wise `|` instead of `or`, and every row gets an explicit 0 or 1.
    is_outlier = (price <= min_price) | (price >= max_price)
    df['potential_outlier'] = is_outlier.astype(int)

    return df
    

In [17]:
# Geocode the new-development listings through the shared helper instead of a
# copy of its body, so this cell also creates the 'geometry' column that the
# column selection further down relies on.
df_new_development = get_street_lat_lon(df_new_development)
    

In [28]:
# Preview the first five geocoded listings
df_new_development.head(n=5)


Unnamed: 0,id,name,summary,price,sqm,rooms,baths,floors,url,location,point,latitude,longitude,altitude,geometry
0,AtticoviaSantoStefano91SantoStefanoBologna_570...,"Attico via Santo Stefano 91, Santo Stefano, Bo...",Centro Storico Via SANTO STEFANO 91 - COMPLETA...,570000,108,3,2,2,https://www.immobiliare.it/annunci/85084456/,"(Via Santo Stefano 91, 40125 Bologna Metropoli...","(44.487351, 11.35277, 0.0)",44.487351,11.35277,0.0,POINT (11.35277 44.48735)
1,AppartamentoviaAudinot18-2SaragozzafuoriPortaB...,"Appartamento via Audinot 18-2, Saragozza fuor...",Rif: 30232 - Appartamento composto da: ingress...,558000,115,5,2,Pianorialzato,https://www.immobiliare.it/annunci/85672574/,"(Osteria Fuori Porta, Via della Beverara, 8, B...","(44.511095, 11.329733, 0.0)",44.511095,11.329733,0.0,POINT (11.32973 44.51109)
2,BilocaleviaAudinot18-2SaragozzafuoriPortaBolog...,"Bilocale via Audinot 18-2, Saragozza fuori Po...",Rif: 30230 - Appartamento composto da: soggior...,292000,60,2,1,Pianorialzato,https://www.immobiliare.it/annunci/85672578/,"(Osteria Fuori Porta, Via della Beverara, 8, B...","(44.511095, 11.329733, 0.0)",44.511095,11.329733,0.0,POINT (11.32973 44.51109)
3,AppartamentoviaAudinot18-2SaragozzafuoriPortaB...,"Appartamento via Audinot 18-2, Saragozza fuor...",Rif: 30227 - Appartamento composto da: ingress...,592000,118,5,2,1,https://www.immobiliare.it/annunci/85672580/,"(Osteria Fuori Porta, Via della Beverara, 8, B...","(44.511095, 11.329733, 0.0)",44.511095,11.329733,0.0,POINT (11.32973 44.51109)
4,QuadrilocaleviaAudinot18-2SaragozzafuoriPortaB...,"Quadrilocale via Audinot 18-2, Saragozza fuor...",Rif: 30223 - Appartamento composto da: soggior...,446000,86,4,2,2,https://www.immobiliare.it/annunci/85672582/,"(Osteria Fuori Porta, Via della Beverara, 8, B...","(44.511095, 11.329733, 0.0)",44.511095,11.329733,0.0,POINT (11.32973 44.51109)


In [29]:

# Keep only the columns needed downstream (drops 'point' and 'altitude')
keep_columns = [
    'id', 'name', 'summary', 'price', 'sqm', 'rooms', 'baths', 'floors',
    'url', 'location', 'latitude', 'longitude', 'geometry',
]
df = df_new_development[keep_columns]


In [30]:
# Preview the trimmed frame. The bare `df.drop` only referenced the method
# without calling it; the stored output of this cell is a five-row preview.
df.head()

Unnamed: 0,id,name,summary,price,sqm,rooms,baths,floors,url,location,latitude,longitude,geometry
0,AtticoviaSantoStefano91SantoStefanoBologna_570...,"Attico via Santo Stefano 91, Santo Stefano, Bo...",Centro Storico Via SANTO STEFANO 91 - COMPLETA...,570000,108,3,2,2,https://www.immobiliare.it/annunci/85084456/,"(Via Santo Stefano 91, 40125 Bologna Metropoli...",44.487351,11.35277,POINT (11.35277 44.48735)
1,AppartamentoviaAudinot18-2SaragozzafuoriPortaB...,"Appartamento via Audinot 18-2, Saragozza fuor...",Rif: 30232 - Appartamento composto da: ingress...,558000,115,5,2,Pianorialzato,https://www.immobiliare.it/annunci/85672574/,"(Osteria Fuori Porta, Via della Beverara, 8, B...",44.511095,11.329733,POINT (11.32973 44.51109)
2,BilocaleviaAudinot18-2SaragozzafuoriPortaBolog...,"Bilocale via Audinot 18-2, Saragozza fuori Po...",Rif: 30230 - Appartamento composto da: soggior...,292000,60,2,1,Pianorialzato,https://www.immobiliare.it/annunci/85672578/,"(Osteria Fuori Porta, Via della Beverara, 8, B...",44.511095,11.329733,POINT (11.32973 44.51109)
3,AppartamentoviaAudinot18-2SaragozzafuoriPortaB...,"Appartamento via Audinot 18-2, Saragozza fuor...",Rif: 30227 - Appartamento composto da: ingress...,592000,118,5,2,1,https://www.immobiliare.it/annunci/85672580/,"(Osteria Fuori Porta, Via della Beverara, 8, B...",44.511095,11.329733,POINT (11.32973 44.51109)
4,QuadrilocaleviaAudinot18-2SaragozzafuoriPortaB...,"Quadrilocale via Audinot 18-2, Saragozza fuor...",Rif: 30223 - Appartamento composto da: soggior...,446000,86,4,2,2,https://www.immobiliare.it/annunci/85672582/,"(Osteria Fuori Porta, Via della Beverara, 8, B...",44.511095,11.329733,POINT (11.32973 44.51109)
