# Instructions

This colab notebook is meant for testing purposes. It is a copy of the original idealista scraping notebook but having the accessing-to-idealista part removed so it focuses mainly on the scraping part.

To understand it better, the original notebook does the following:
1. obtains access to the main area (Madrid)
2. retrieves a link of all the sub-areas (districts or even smaller areas)
3. go through each district results page to obtain a list of properties urls
4. access each property url, retrieves its html source code and dumps it into /raw_data/properties/ folder
5. then it scrapes the dumped data and elaborates the dataset

This notebook focuses on the 5th part which is the only one that does not need a scraperapi account or proxy whatsoever. Still, it is to be considered as scraping since it extracts data form html source code.



In [1]:
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive', force_remount=True)

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [2]:
from bs4 import BeautifulSoup
import concurrent.futures
import pandas as pd
import numpy as np
import requests
import time
import json
import os
import re

In [3]:
raw_data_path = '/content/drive/MyDrive/Kschool_TFM/raw_data'

In [12]:
class idealista_scraper:

    def __init__(self, url:str, raw_data_path:str) -> None:
        self._IDEALISTA_HOSTNAME = 'https://www.idealista.com'
        self._url = url
        self.raw_data_path = raw_data_path

    def create_dataset(self, limit:int=0) -> None:
        '''
        Access the dumped html code saved as text files for all the properties and
        retrieve the properties' features from them. It creates a new class property "dataset"
        '''
        dumped_data_files = os.listdir(f'{self.raw_data_path}{os.sep}properties{os.sep}')

        # if dataset exists then this is an update so skip the files already processed
        if hasattr(self,'dataset'):
            dumped_data_files = [x for x in dumped_data_files if not (self.dataset['id']==x).any()]

        if limit > 0:
          dumped_data_files = dumped_data_files[:limit]

        with concurrent.futures.ProcessPoolExecutor() as executor:
            executor.map(self._get_single_property_data,dumped_data_files)

        self.dataset = pd.read_csv(f'{self.raw_data_path}{os.sep}idealista_dataset_testing.csv')

    def _get_single_property_data(self,prop_dumped_data_file_name:str) -> None:
        '''
        Used to retrieve single property data with parallel processing
        
        args:
        - prop_dumped_data_file_name: name of the text file to retrieve the data from

        '''
        property_id = prop_dumped_data_file_name

        with open(f'{self.raw_data_path}{os.sep}properties{os.sep}{property_id}','r') as f:
            soup = BeautifulSoup(f.read(),'html.parser')

        print(f'{property_id}: soup parsed')

        if soup.select('#notFoundWithSuggestions'):
            print(f'{property_id}: this property has been removed')
            return None

        try:
            utag_script = list(filter(lambda x: 'utag_data' in x.get_text(),soup.select('script')))[0]
            utag_data = json.loads(str(utag_script).split(';')[0].split(' ')[7])
        except:
            print(f'{property_id}: cannot retrieve data')
            return None

        property_details = soup.select_one('div.details-property')
        property_data = {
            'id':utag_data['ad']['id'],
            'propertyType':soup.select_one('.main-info .typology').text.strip().lower(),
            'title':soup.select_one('.main-info .txt-body').text.strip().lower(),
            'description': soup.select_one('div.comment').text,
            'locationId':utag_data['ad']['address']['locationId'],
            'price':utag_data['ad']['price'],
            'size':utag_data['ad']['characteristics']['constructedArea'],
            'hasParking':utag_data['ad']['characteristics'].get('hasParking',0), # if not exist, get 0
            'roomNumber':utag_data['ad']['characteristics']['roomNumber'],
            'bathNumber':utag_data['ad']['characteristics']['bathNumber'],
            'hasSwimmingPool':utag_data['ad']['characteristics'].get('hasSwimmingPool',0), # if not exist, get 0
            'hasTerrace':utag_data['ad']['characteristics'].get('hasTerrace',0), # if not exist, get 0
            'hasGarden':utag_data['ad']['characteristics'].get('hasGarden',0), # if not exist, get 0
            'hasLift':utag_data['ad']['characteristics'].get('hasLift',0), # if not exist, get 0
            'hasAirco':1 if 'aire acondicionado' in property_details.text.strip().lower() else 0,
            'hasFittedWardrobes':1 if 'armario empotrado' in property_details.text.strip().lower().replace('s','') else 0,
            'isGoodCondition':utag_data['ad']['condition']['isGoodCondition'],
            'isNeedsRenovating':utag_data['ad']['condition']['isNeedsRenovating'],
            'isNewDevelopment':utag_data['ad']['condition']['isNewDevelopment'],
            'energyCertification':utag_data['ad']['energyCertification']['type'],
            'featureTags': [x.get_text().strip() for x in soup.select('.info-features-tags')]
        }

        year_built = list(filter(lambda x: 'construido en' in x.get_text().lower(), property_details.select('li')))
        property_data['yearBuilt'] = year_built[0].get_text() if len(year_built)>0 else "no info"
        del year_built

        orientation = list(filter(lambda x: 'orientación ' in x.get_text().lower(), property_details.select('li')))
        property_data['orientation'] = orientation[0].get_text() if len(orientation)>0 else "no info"
        del orientation

        heatingData = list(filter(lambda x: 'calefacción' in x.get_text().lower(),property_details.select('li')))
        property_data['heatingType'] = heatingData[0].get_text() if heatingData else "no info"
        del heatingData

        if property_data['propertyType'] == 'piso':
            info_features = soup.select('.info-features > span')
            if [x for x in info_features if "interior" in x.get_text()]:
                property_data['interiorExterior'] = "interior"
            elif [x for x in info_features if "exterior" in x.get_text()]:
                property_data['interiorExterior'] = "exterior"
            else:
                property_data['interiorExterior'] = "no info"
            floor_info = [x for x in info_features if re.search("bajo|sótano|planta", x.get_text().lower())]
            if floor_info:
                property_data['floor'] = floor_info[0].select_one('span').get_text().lower().strip()
            else:
                property_data['floor'] = "no info"
        else:
            property_data['floor'] = property_data['interiorExterior'] = "does not apply"
        
        property_data_df = pd.DataFrame.from_dict(property_data,orient='index').T
        print(f'{property_id}: data converted to DF')

        header = not os.path.exists(f'{self.raw_data_path}{os.sep}idealista_dataset_testing.csv')
        property_data_df.to_csv(f'{self.raw_data_path}{os.sep}idealista_dataset_testing.csv', mode='a', index=False, header=header)

    def get_location_ids_mapper(self) -> dict:
        '''
        By default, every record in the dataset has de feature "location id" which
        is the website own id to identify the district. This function scrape and
        creates a new class property named "location_ids_mapper", a dict to map 
        location ids to district's names

        '''
        
        if os.path.exists(f'{self.raw_data_path}{os.sep}location_ids.json'):
            with open(f'{self.raw_data_path}{os.sep}location_ids.json','r') as f:
                location_ids_mapper = json.loads(f.read())

        return location_ids_mapper

    def full_scrape_testing(self, limit:int=5) -> None:
        '''
        Runs a full scrape of idelista properties. Firstly it will run get_areas_df(),
        then generate_properties_links_df() and get_properties_data(). It will then complete
        the dataset by running the get_location_ids_mapper() and applying the mapper to the 
        dataset. Finally, it will export the dataset calling the file "idealista_dataset.csv"

        Receives no arguments and returns None
        '''

        self.create_dataset(limit=limit)

        mapper = self.get_location_ids_mapper()
        self.dataset['locationId'] = self.dataset['locationId'].apply(lambda x: "-".join(x.split("-")[:8]))
        self.dataset['area_name'] = self.dataset['locationId'].map(mapper)
        self.dataset = self.dataset.drop('locationId',axis=1)

        self.dataset.to_csv(f'{self.raw_data_path}{os.sep}idealista_dataset_testing.csv',index=False)

In [None]:
url = 'https://www.idealista.com/venta-viviendas/madrid-madrid/'
scraper = idealista_scraper(url, raw_data_path)

scraper.full_scrape_testing(limit=10) # there are only 10 elements in the properties folder

In [14]:
scraper.dataset.head()

Unnamed: 0,id,propertyType,title,description,locationId,price,size,hasParking,roomNumber,bathNumber,...,isGoodCondition,isNeedsRenovating,isNewDevelopment,energyCertification,featureTags,yearBuilt,orientation,heatingType,floor,interiorExterior
0,455306,chalet,chalet pareado en venta en urb. puerta de hier...,\n\n\nEstate One ofrece casa maravillosa de 4 ...,0-EU-ES-28-07-001-079-09-003,1500000,453,1,6,6,...,1,0,0,inProcess,['Lujo'],Construido en 2005,no info,Calefacción individual: Gas natural,does not apply,does not apply
1,324313,piso,"piso en venta en calle alcala de guadaira, 26",\n\n\nPiso bajo a pie de calle.\n\n\n,0-EU-ES-28-07-001-079-13-003,144000,60,0,1,1,...,1,0,0,a,[],no info,Orientación norte,Calefacción individual: Bomba de frío/calor,exterior,bajo
2,397293,piso,"dúplex en venta en calle san benito, 36","\n\n\nPRECIOSO DUPLEX SEMINUEVO. Exterior, sol...",0-EU-ES-28-07-001-079-06-004,502000,152,1,3,3,...,1,0,0,inProcess,['Lujo'],Construido en 2003,Orientación sur,Calefacción individual,exterior,bajo
3,175162,piso,"piso en venta en calle garcia quintanilla, 4","\n\n\n""TIEMPO PARA EL DETALLE""TIME FOR DETAIL""...",0-EU-ES-28-07-001-079-15-004,795000,201,1,2,2,...,0,0,1,unknown,['Lujo'],no info,no info,no info,exterior,planta 3ª
