In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from enum import Enum, auto
from tqdm import tqdm
from datetime import date

In [2]:
class ResidenceType(Enum):
    """Kinds of property the scraper iterates over.

    ``run`` lower-cases each member *name* and passes it on as the
    residence-type segment of the search URL; the member values are only
    sequential identifiers.
    """
    # No trailing comma after auto(): a comma would wrap the value in a tuple.
    MORADIA = auto()
    APARTAMENTO = auto()

class Condition(Enum):
    """Condition labels searched for in a listing's details text.

    ``detail_extract`` joins these values into a regular-expression
    alternation, so each value is matched verbatim against the scraped text.
    """
    RUINA = "Ruína"
    NOVO = "Novo"
    RENOVADO = "Renovado"
    USADO = "Usado"
    EM_CONSTRUCAO = "Em construção"
    PARA_RECUPERAR = "Para recuperar"

    

In [3]:
def get_regions():
    """Return the regions to scrape as ``(name, region_id)`` pairs.

    Both elements of every pair are strings. A fresh list is built on each
    call, in the order the regions are declared below.
    """
    # Region name -> region id, kept in declaration order.
    region_ids = {
        "Aveiro": "1",
        "Beja": "2",
        "Braga": "3",
        "Bragança": "4",
        "Castelo Branco": "5",
        "Coimbra": "6",
        "Évora": "7",
        "Faro": "8",
        "Guarda": "9",
        "Ilha da Graciosa": "24",
        "Ilha da Madeira": "19",
        "Ilha das Flores": "28",
        "Ilha de Porto Santo": "20",
        "Ilha de Santa Maria": "21",
        "Ilha de São Jorge": "25",
        "Ilha de São Miguel": "22",
        "Ilha do Corvo": "29",
        "Ilha do Faial": "27",
        "Ilha do Pico": "26",
        "Ilha Terceira": "23",
        "Leiria": "10",
        "Lisboa": "11",
        "Portalegre": "12",
        "Porto": "13",
        "Santarém": "14",
        "Setúbal": "15",
        "Viana do Castelo": "16",
        "Vila Real": "17",
        "Viseu": "18",
    }
    return list(region_ids.items())

In [4]:
def request_page(url, timeout=30):
    """Download ``url`` and return its body parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection,
            which would freeze the whole scrape.

    Returns:
        BeautifulSoup: The response body parsed with ``html.parser``.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    # The request is sent with a browser-like User-Agent header.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.content, "html.parser")

In [5]:
def create_request_link(residence_type, region, page, service_type='arrendar'):
    """Build the search-results URL for one region and page.

    Args:
        residence_type: Path segment for the property kind, e.g. ``"moradia"``.
        region: ``(name, region_id)`` pair. The name is lower-cased and its
            spaces replaced by dashes to form a path segment; the id is sent
            as the ``search[region_id]`` query parameter.
        page: Page number to request (int or str).
        service_type: Path segment for the listing kind. Defaults to
            ``'arrendar'``; previously this argument was accepted but ignored.

    Returns:
        str: The URL, always asking for 72 ads per page.
    """
    region_name, region_id = region[0], region[1]
    region_slug = region_name.lower().replace(" ", "-")
    return (
        f"https://www.imovirtual.com/{service_type}/{residence_type}"
        f"/{region_slug}"
        f"/?search%5Bregion_id%5D={region_id}&nrAdsPerPage=72&page={page}"
    )

In [6]:
def get_num_pages(soup):
    """Return the number of result pages advertised by a results page.

    The count is read from the second-to-last ``<li>`` of the
    ``<ul class="pager">`` element (presumably the last item is a "next"
    control — confirm against the site markup).

    Args:
        soup: Parsed results page exposing ``find`` / ``find_all``.

    Returns:
        int: The page count, or 1 when the pager is missing, has fewer than
        two items, or its text is not a number.
    """
    try:
        pager = soup.find('ul', attrs={'class': 'pager'})
        # int() stays inside the try so non-numeric pager text falls back to 1
        # instead of aborting the scrape.
        return int(pager.find_all('li')[-2].text)
    except (AttributeError, TypeError, IndexError, ValueError):
        return 1

In [7]:
def get_attribute_safe(element,name, attribute, default='NA'):
    """Look up a child tag and return its text, or ``default`` if absent.

    For ``name == 'img'`` the ``alt`` attribute of the first matching image
    that has one is returned; for any other tag name the tag's ``.text`` is
    returned. A failed lookup (``AttributeError`` / ``TypeError``) yields
    ``default`` instead of raising.
    """
    try:
        if name != 'img':
            tag = element.find(name, attribute)
            return tag.text
        # Images carry no text; their label is in the alt attribute.
        image = element.find(name, attribute, alt=True)
        return image['alt']
    except (AttributeError, TypeError):
        return default


In [8]:
def get_infos(soup):
    """Extract one record per ``<article>`` element of a results page.

    Every field is looked up with ``get_attribute_safe``, so a missing
    element yields the string ``'NA'`` rather than an exception.

    Args:
        soup: Parsed results page exposing ``find_all``.

    Returns:
        list[dict]: One dict per ad with the keys ``price``,
        ``energy_certify``, ``metric``, ``description``, ``location``,
        ``rooms``, ``details`` and ``company``; all values are stripped
        strings.
    """
    ads = []
    for ad in soup.find_all('article'):
        price = get_attribute_safe(ad, 'li', {'class': 'offer-item-price'})
        energy_certify = get_attribute_safe(ad, 'div', {'class': 'energy-certify'})
        metric = get_attribute_safe(ad, 'strong', {'class': 'visible-xs-block'})
        description = get_attribute_safe(ad, 'span', {'class': 'offer-item-title'})
        location = get_attribute_safe(ad, 'p', {'class': 'text-nowrap'})
        rooms = get_attribute_safe(ad, 'li', {'class': 'offer-item-rooms hidden-xs'})
        details = get_attribute_safe(ad, 'ul', {'class': 'params-small clearfix hidden-xs'})
        # Same class-dict form as the other lookups (was a bare set literal).
        company = get_attribute_safe(ad, 'img', {'class': 'company-logo lazy'})

        # The location text is "<prefix>: <place>"; keep the part after the
        # colon. When there is no colon (e.g. the 'NA' default) keep the whole
        # text instead of raising IndexError and losing the entire page.
        location_parts = location.strip().split(':')
        place = location_parts[1] if len(location_parts) > 1 else location_parts[0]

        ads.append({
            # Keep only what precedes the euro sign.
            'price': price.strip().split('€')[0].strip(),
            'energy_certify': energy_certify.strip(),
            'metric': metric.strip(),
            'description': description.strip(),
            'location': place.strip(),
            'rooms': rooms.strip(),
            'details': details.strip(),
            'company': company.strip()
        })

    return ads

In [9]:
def detail_extract(df):
    """Derive ``bathroom`` and ``condition`` from the raw ``details`` text.

    Works on a copy; the input frame is left untouched.

    Args:
        df: Frame with a text ``details`` column and a ``company`` column.

    Returns:
        pandas.DataFrame: A copy of ``df`` without ``details`` and with
        * ``company`` set to ``'Anúncio Particular'`` where the details
          mention ``'Anúncio'``;
        * ``bathroom``: every digit run found in the details, concatenated
          and converted to a number (NaN when the text has no digits);
        * ``condition``: the first ``Condition`` value found in the details
          (NaN when none matches).
    """
    temp = df.copy()
    details = temp['details'].astype('str')

    # Boolean mask used directly; no round-trip through index labels.
    private_ad = details.str.contains('Anúncio')
    temp.loc[private_ad, 'company'] = 'Anúncio Particular'

    # Raw-string pattern (a plain '\d' is an invalid escape sequence). Rows
    # without digits become '' and are coerced to NaN, which also covers the
    # case where no row contains a digit at all.
    digit_text = details.str.findall(r'\d+').str.join('')
    temp['bathroom'] = pd.to_numeric(digit_text, errors='coerce')

    condition_pattern = '|'.join(condition.value for condition in Condition)
    temp['condition'] = details.str.extract(f"({condition_pattern})", expand=False)

    return temp.drop(['details'], axis=1)

In [10]:
def run():
    """Scrape every residence type in every region into one DataFrame.

    For each ``ResidenceType`` and each region from ``get_regions`` the first
    results page is downloaded, its pager gives the page count, and every
    page's ads are collected. The first page is parsed from the response
    already in hand rather than being requested a second time.

    Returns:
        pandas.DataFrame: One row per ad with the fields produced by
        ``get_infos`` plus ``property_type`` (lower-cased residence name),
        ``district`` (region name) and ``extract_date`` (today's date as a
        datetime), with a fresh 0..n-1 index.
    """
    frames = []
    regions = get_regions()
    for residence in ResidenceType:
        residence_slug = residence.name.lower()
        for region in tqdm(regions, desc=f'{residence}'):
            url = create_request_link(residence_type=residence_slug, region=region, page=1)
            soup = request_page(url)
            pages = get_num_pages(soup)

            for page in range(1, pages + 1):
                if page > 1:
                    # Page 1 was fetched above to read the pager; only later
                    # pages need another request.
                    url = create_request_link(residence_type=residence_slug, region=region, page=page)
                    soup = request_page(url)

                temp = pd.DataFrame(get_infos(soup))
                temp['property_type'] = residence_slug
                temp['district'] = region[0]
                frames.append(temp)

    # Concatenate once at the end instead of re-copying the frame every page.
    df = pd.concat(frames) if frames else pd.DataFrame()
    df['extract_date'] = date.today()
    df['extract_date'] = pd.to_datetime(df['extract_date'])
    return df.reset_index(drop=True)

In [11]:
# Run the full scrape; tqdm prints one progress bar per residence type.
df = run()

ResidenceType.MORADIA: 100%|███████████████████████████████████████████████████████████| 29/29 [00:38<00:00,  1.31s/it]
ResidenceType.APARTAMENTO: 100%|███████████████████████████████████████████████████████| 29/29 [01:07<00:00,  2.34s/it]


In [12]:
# Replace the raw `details` text with the derived bathroom/condition columns.
# NOTE(review): rebinding `df` makes this cell non-idempotent; a second run
# fails because `details` has already been dropped.
df = detail_extract(df)
print(df.shape)
df.head()

(3043, 12)


Unnamed: 0,price,energy_certify,metric,description,location,rooms,company,property_type,district,extract_date,bathroom,condition
0,900,E,124 m²,Andar de Moradia T3| Arrendamento| Cacia,"Cacia, Aveiro",T3,Zome Viva Aveiro,moradia,Aveiro,2023-07-03,2.0,Usado
1,2 000,D,147 m²,Moradia T4 para arrendamento na praia de Corte...,"Cortegaça, Ovar, Aveiro",T4,A Casa é Sua,moradia,Aveiro,2023-07-03,4.0,Usado
2,5 000,F,300 m²,Moradia T2 para arrendamento,"Mozelos, Santa Maria da Feira, Aveiro",T2,RE/MAX + Grupo Vantagem,moradia,Aveiro,2023-07-03,1.0,
3,900,E,108 m²,"Moradia V3, condomínio privado, para arrendar ...","Santa Maria da Feira, Travanca, Sanfins e Espa...",T3,Chave Nova - S.M.Feira,moradia,Aveiro,2023-07-03,3.0,Usado
4,1 200,E,134 m²,Moradia V4 no centro da Feira,"Santa Maria da Feira, Travanca, Sanfins e Espa...",T4,Imo2007,moradia,Aveiro,2023-07-03,2.0,Usado


In [24]:
# Load the snapshot stored on disk to compare it with the fresh scrape in `df`.
df2 = pd.read_parquet('../data/raw/imovirtual.parquet')
print(df2.shape)

(3062, 12)


In [25]:
# Stack the fresh scrape on top of the stored snapshot (index labels repeat until reset).
df3 = pd.concat([df, df2])
print(df3.shape)

(6105, 12)


In [26]:
# Count rows identical in every column, `extract_date` included.
df3.duplicated().sum()

31

In [27]:
# Keep the first occurrence of each fully identical row and renumber the index.
df3 = df3.drop_duplicates().reset_index(drop=True)

In [28]:
# Preview the merged, de-duplicated frame.
df3.head()

Unnamed: 0,price,energy_certify,metric,description,location,rooms,company,property_type,district,extract_date,bathroom,condition
0,900,E,124 m²,Andar de Moradia T3| Arrendamento| Cacia,"Cacia, Aveiro",T3,Zome Viva Aveiro,moradia,Aveiro,2023-07-03,2.0,Usado
1,2 000,D,147 m²,Moradia T4 para arrendamento na praia de Corte...,"Cortegaça, Ovar, Aveiro",T4,A Casa é Sua,moradia,Aveiro,2023-07-03,4.0,Usado
2,5 000,F,300 m²,Moradia T2 para arrendamento,"Mozelos, Santa Maria da Feira, Aveiro",T2,RE/MAX + Grupo Vantagem,moradia,Aveiro,2023-07-03,1.0,
3,900,E,108 m²,"Moradia V3, condomínio privado, para arrendar ...","Santa Maria da Feira, Travanca, Sanfins e Espa...",T3,Chave Nova - S.M.Feira,moradia,Aveiro,2023-07-03,3.0,Usado
4,1 200,E,134 m²,Moradia V4 no centro da Feira,"Santa Maria da Feira, Travanca, Sanfins e Espa...",T4,Imo2007,moradia,Aveiro,2023-07-03,2.0,Usado


In [36]:
# Ignore the scrape date so rows that differ only in `extract_date` collapse
# into one. errors='ignore' keeps the cell re-runnable after the column has
# already been dropped (a plain drop raises KeyError on the second run).
df3 = df3.drop(columns=['extract_date'], errors='ignore').drop_duplicates()
df3.shape

(3757, 11)

In [39]:
# NOTE(review): `price` is still text here (e.g. "1 000", "Preço sob consulta"),
# so this ordering is lexicographic rather than numeric.
df3.sort_values(['price','energy_certify','metric', 'description'])

Unnamed: 0,price,energy_certify,metric,description,location,rooms,company,property_type,district,bathroom,condition
523,1 000,A,120 m²,Aluga-se moradia,"Mouçós e Lamares, Vila Real",T2,OLX,moradia,Vila Real,1.0,Usado
3644,1 000,A,120 m²,Apartamento T2 para arrendar Avenida da Liberd...,Braga (São José de São Lázaro e São João do So...,T2,,apartamento,Braga,2.0,Novo
419,1 000,A,130 m²,"Moradia nova T2+1 , zona de Rio Tinto","Rio Tinto, Gondomar, Porto",T2,OLX,moradia,Porto,3.0,Novo
835,1 000,A,180 m²,T5 Celas + garagem,"Santo António dos Olivais, Coimbra",T5,OLX,apartamento,Coimbra,2.0,Renovado
169,1 000,A,300 m²,Casa para alugar,"Peniche, Leiria",T2,OLX,moradia,Leiria,1.0,Renovado
...,...,...,...,...,...,...,...,...,...,...,...
1022,Preço sob consulta,E,55 m²,Arrenda-se moradia com apartamentos para férias,"Peniche, Leiria",T1,,apartamento,Leiria,1.0,Usado
1039,Preço sob consulta,E,60 m²,Apartamento T1 para arrendamento de férias,"Peniche, Leiria",T1,,apartamento,Leiria,,Usado
126,Preço sob consulta,E,80 m²,Moradia T1+1 em Tavira,"Tavira (Santa Maria e Santiago), Tavira, Faro",T1,"WORK DIGITAL SUN, UNIPESSOAL, LDA",moradia,Faro,,Renovado
16,Preço sob consulta,F,40 m²,T0 para arrendar em Oliveira de Azemeis,"Oliveira de Azeméis, Santiago de Riba-Ul, Ul, ...",T0,,moradia,Aveiro,1.0,Renovado


In [13]:
# Write the processed scrape (`df`, not the merged `df3`) to the snapshot file.
# NOTE(review): this cell carries execution count 13 although it sits last in
# the notebook; confirm whether the snapshot is meant to be overwritten before
# or after the comparison cells that read the same path.
df.to_parquet('../data/raw/imovirtual.parquet')