In [145]:
%%time

from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--enable-javascript')
chrome_options.add_argument('user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')

from bs4 import BeautifulSoup
import requests
import re
import time
import pandas as pd
from lxml import etree, html
import json


class fotocasa_scraping:
  '''
  Clase para inicializar el scraping de Fotocasa.com
  '''

  def check_features(self, data):
    '''
    Comprobaciones de features para lectura correcta
    '''    
    # Diccionario de features
    realestate = {
        'title': '',
        'link': '',
        'image_url': '',
        'country': '',
        'district': '',
        'neighborhood': '',
        'street': '',
        'zipCode': '',
        'province': '',
        'buildingType': '',
        'clientAlias': '',
        'latitude': '',
        'longitude': '',
        'isNewConstruction': '',
        'rooms': '',
        'bathrooms': '',
        'parking': '',
        'elevator': '',
        'furnished': '',
        'surface': '',
        'energyCertificate': '',
        'hotWater': '',
        'heating': '',
        'conservationState': '',
        'antiquity': '',
        'floor': '',
        'surfaceLand': '',
        'otherFeatures': '',
        'price': '',     
        }
    # Comienzan las comprobaciones feature a feature
    try:
        realestate['title'] = data['propertyTitle']    
    except:
        realestate['title'] = '^'
        
    try:
        realestate['link'] = 'https://www.fotocasa.es' + data['realEstate']['detail']['es-ES']   
    except:
        realestate['link'] = '^'

    try:
        realestate['image_url'] = data['realEstate']['multimedia'][1]['src']
    except:
        realestate['image_url'] = '^'
        
    try:
        realestate['country'] = data['realEstate']['address']['country']
    except:
        realestate['country'] = '^'
        
    try:
        realestate['district'] = data['realEstate']['address']['district']
    except:
        realestate['district'] = '^'
        
    try:
        realestate['neighborhood'] = data['realEstate']['address']['neighborhood']
    except:
        realestate['neighborhood'] = '^'
        
    try:
        realestate['street'] = data['realEstate']['location']
    except:
        realestate['street'] = '^'
        
    try:
        realestate['zipCode'] = data['realEstate']['address']['zipCode']
    except:
        realestate['zipCode'] = '^'
        
    try:
        realestate['province'] = data['realEstate']['address']['province']
    except:
        realestate['province'] = '^'
        
    try:
        realestate['buildingType'] = data['realEstate']['buildingType']
    except:
        realestate['buildingType'] = '^'

    try:
        realestate['clientAlias'] = data['realEstate']['clientAlias']
    except:
        realestate['clientAlias'] = '^'
        
    try:
        realestate['latitude'] = data['realEstate']['coordinates']['latitude']
    except:
        realestate['latitude'] = '^'

    try:
        realestate['longitude'] = data['realEstate']['coordinates']['longitude']
    except:
        realestate['longitude'] = '^'
        
    try:
        realestate['isNewConstruction'] = data['realEstate']['isNewConstruction']
    except:
        realestate['isNewConstruction'] = '^'
        
    try:
        realestate['rooms'] = data['realEstate']['features']['rooms']
    except:
        realestate['rooms'] = '^'
        
    try:
        realestate['bathrooms'] = data['realEstate']['features']['bathrooms']
    except:
        realestate['bathrooms'] = '^'

    try:
        featureList = data['realEstate']['featuresList']
        realestate['parking'] = ''.join([featureList[index]['value'] for index,value in enumerate(featureList) if featureList[index]['label'] == 'parking'])
        
    except:
        realestate['parking'] = '^'

    try:
        featureList = data['realEstate']['featuresList']
        realestate['elevator'] = ''.join([featureList[index]['value'] for index,value in enumerate(featureList) if featureList[index]['label'] == 'elevator'])
        
    except:
        realestate['elevator'] = '^'

    try:
        featureList = data['realEstate']['featuresList']
        realestate['furnished'] = ''.join([featureList[index]['value'] for index,value in enumerate(featureList) if featureList[index]['label'] == 'furnished'])
        
    except:
        realestate['furnished'] = '^'
        
    try:
        realestate['surface'] = data['realEstate']['features']['surface']
    except:
        realestate['surface'] = '^'
        
    try:
        realestate['energyCertificate'] = data['realEstate']['energyCertificate']
    except:
        realestate['energyCertificate'] = '^'
        
    try:
        realestate['hotWater'] = data['realEstate']['features']['hotWater']
        featureList = data['realEstate']['featuresList']
        realestate['hotWater'] = ''.join([featureList[index]['value'] for index,value in enumerate(featureList) if featureList[index]['label'] == 'hotWater'])
        
    except:
        realestate['hotWater'] = '^'
        
    try:
        realestate['heating'] = data['realEstate']['features']['heating']
        featureList = data['realEstate']['featuresList']
        realestate['heating'] = ''.join([featureList[index]['value'] for index,value in enumerate(featureList) if featureList[index]['label'] == 'heating'])
       
    except:
        realestate['heating'] = '^'
        
    try:
        realestate['conservationState'] = data['realEstate']['features']['conservationState']
        featureList = data['realEstate']['featuresList']
        realestate['conservationState'] = ''.join([featureList[index]['value'] for index,value in enumerate(featureList) if featureList[index]['label'] == 'conservationState'])
       
    except:
        realestate['conservationState'] = '^'
        
    try:
        realestate['antiquity'] = data['realEstate']['features']['antiquity']
        featureList = data['realEstate']['featuresList']
        realestate['antiquity'] = ''.join([featureList[index]['value'] for index,value in enumerate(featureList) if featureList[index]['label'] == 'antiquity'])
       
    except:
        realestate['antiquity'] = '^'
        
    try:
        realestate['floor'] = data['realEstate']['features']['floor']
    except:
        realestate['floor'] = '^'
        
    try:
        realestate['surfaceLand'] = data['realEstate']['features']['surfaceLand']
    except:
        realestate['surfaceLand'] = '^'
        
    try:
        realestate['otherFeatures'] = data['realEstate']['otherFeatures']
    except:
        realestate['otherFeatures'] = '^'
        
    try:
        realestate['price'] = data['realEstate']['price']
    except:
        realestate['price'] = 0
        
    #devuelve un diccionario
    return realestate

  def parse_properties(self, driver, url_list):
    # Recibo una lista de urls de la propiedad y lo separo en un diccionario. Devuelvo el diccionario.
    
    df_page = pd.DataFrame()
    
    for url in url_list:
        driver.get(url)
        html_txt = driver.page_source
        soup = BeautifulSoup(html_txt,'html.parser')
        prop_scripts = soup.findAll('script')
        prop_features = ''.join([re.search('window.__INITIAL_PROPS__ = JSON.parse(.*)\n',str(x)).group(1) for x in prop_scripts if re.search('window.__INITIAL_PROPS__',str(x))])
        prop_features_clean = re.sub(r'\\"','"',prop_features)
        prop_features_clean = re.sub(r'\\\\"','',prop_features_clean)
        prop_features_clean = re.sub(r'\("|"\);','',prop_features_clean)
        prop_features_clean = re.sub(r',"seo":.*','}',prop_features_clean)
        try:
            prop_data = json.loads(prop_features_clean)
            realestate = self.check_features(prop_data)
            
        except:
            print('Error: ' + url + '\n' + str(prop_features_clean))
            
        #posibilidad de crear una función de comprobación de los campos para asignar valores default en caso de que no existan
        df = pd.DataFrame([realestate])
        df_page = pd.concat([df_page,df],ignore_index=True)
        
    return df_page

  def property_list(self, driver, city, page):
    # Recibo un número de página. Almaceno las urls de todas las propiedades de cada página de parrilla. Devuelvo una lista de urls.
    driver.get('https://www.fotocasa.es/es/comprar/viviendas/' + city.lower() + '-provincia/todas-las-zonas/l' + '/' + str(page))

    # Creación de lista de article (propiedades) del grid
    properties_grid = []
    properties_grid_alternative = []
    l_now = 0
    l_before = -1
    while len(properties_grid)+len(properties_grid_alternative) < 30: # número de tarjetas de propiedad por página
        html_txt = driver.page_source
        soup = BeautifulSoup(html_txt,'html.parser')
        #properties_grid = soup.findAll('article')
        l_now = len(properties_grid)
        
        if(l_before < l_now):
            properties_grid = soup.findAll('article')
        else:
            properties_grid_alternative = soup.findAll('div', attrs={'class': re.compile(r'^re-CardPack.*-info.*')})
            
        l_before = len(properties_grid)
        
        
        ActionChains(driver).key_down(Keys.PAGE_DOWN).key_up(Keys.PAGE_DOWN).perform()
        time.sleep(0.2)
    
    # Creación de lista con urls de todas las propiedades del grid
    property_url_list = []
    
    if(len(properties_grid_alternative) > 0):
        properties_grid = properties_grid + properties_grid_alternative
        #print(str(len(properties_grid)))
        #print(properties_grid)
        
    domain = 'https://www.fotocasa.es'
    for article in properties_grid:
        prop = etree.HTML(str(article))
        try:
            url = prop.xpath('//a[contains(@class, "info-container") or contains(@class, "carousel") or contains(@class, "slider")]//@href')[0]
            property_url_list.append(f'{domain}{url}')
        except:
            pass


    return property_url_list

  def pages_to_scrape(self, driver, city):
    # Obtengo el número de páginas totales que debo recorrer. Devuelvo un entero.

    driver.get('https://www.fotocasa.es/es/comprar/viviendas/' + city.lower() + '-provincia/todas-las-zonas/l')
    
    page_selector = []
    while len(page_selector) < 1:
        html_txt = driver.page_source
        soup = BeautifulSoup(html_txt,'html.parser')
        page_selector = soup.findAll('li',attrs={'class':'sui-MoleculePagination-item'})
        ActionChains(driver).key_down(Keys.PAGE_DOWN).perform()
        time.sleep(0.2)
       
    n_pages = re.search('<span class="sui-AtomButton-inner">(.*)</span>',str(page_selector[-2])).group(1)
    
    print('Pages to scrape: ' + str(n_pages))
    
    return int(n_pages)

  def __init__(self, city='Madrid'):
    '''
    Inicio de la clase con el scraping de Fotocasa para la ciudad indicada como parámetro.
    Por defecto: Madrid
    '''
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=chrome_options)
    n_pages = self.pages_to_scrape(driver, city)
    #n_pages = 308
    self.data = pd.DataFrame()
    
    for page in range(1,n_pages): #restamos la página final que estará incompleta
        properties_per_page = self.property_list(driver, city, page)
        property_data = self.parse_properties(driver, properties_per_page)
        self.data = pd.concat([self.data,property_data], ignore_index=True)
        print(str(page))
        
        if (page == 50 or page == 100 or page == 150 or page == 200 or page == 250 or page == 300 or page == 350 or page == 400 or page == 450 or page == 500 or page == 550 or page == 600 or page == 650 or page == 700 or page == 750 or page == 800 or page == 850 or page == 900 or page == 950):
            print('Scraped pages: ' + str(page))
            self.data.to_csv('scraped_' + str(page) + '.csv',index=False)
        
    # Leer página inicial de la parrilla de la ciudad enviada por parámetro. OK
    # Obtener el número de páginas totales a recorrer. OK
    # Almacenar todas las urls de propiedades que debo recorrer OK
    # Meterme en cada url almacenada y scrapear window.__INITIAL_PROPS__ OK
    # Limpiar el JSON de cada propiedad OK
    # Cambiar de página y comenzar el proceso iterativo recorriendo todas las propiedades de cada página. OK
    self.data.to_csv('allscraped_' + str(n_pages) + '.csv',index=False)
    return None

CPU times: user 72 µs, sys: 11 µs, total: 83 µs
Wall time: 84.9 µs


In [146]:
%%time
ft_scraping = fotocasa_scraping()


Pages to scrape: 964
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
Scraped pages: 50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
Scraped pages: 100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
Scraped pages: 150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
Error: https://www.fotocasa.es/es/comprar/vivienda/obra-nueva/madrid-capital/19624728/162619010?from=list

176
177
178
179
180
181
182
183
184
185
186
Error: https://www.fotocasa.es/es/comprar/vivienda/obra-nueva/madrid-capital/19572296/153738216?from=list

187
188
189
190
191
192
193
194
195
196
197
198
199
200
Scraped pages: 2

In [147]:
ft_scraping.data.shape

(28944, 29)

In [148]:
ft_scraping.data.head(10)

Unnamed: 0,title,link,image_url,country,district,neighborhood,street,zipCode,province,buildingType,...,surface,energyCertificate,hotWater,heating,conservationState,antiquity,floor,surfaceLand,otherFeatures,price
0,Piso en venta en Monte Esquinza,https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.inmofactory.com/images/inmofact...,España,Chamberí,Almagro,Monte Esquinza,28010,Madrid,Flat,...,233,G,Gas Natural,Gas Natural,Casi nuevo,30 a 50 años,0,0,"{'1': 'Aire acondicionado', '2': 'Armarios', '...",1690000
1,Piso en venta,https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.inmofactory.com/images/inmofact...,España,Centro,Embajadores - Lavapiés,,28012,Madrid,Flat,...,83,^,,,Muy bien,+ 100 años,7,0,"{'3': 'Calefacción', '18': 'Suite - con baño',...",424500
2,Piso en venta en Monte Esquinza,https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.inmofactory.com/images/inmofact...,España,Chamberí,Almagro,Monte Esquinza,28010,Madrid,Flat,...,233,G,Gas Natural,Gas Natural,Casi nuevo,30 a 50 años,0,0,"{'1': 'Aire acondicionado', '2': 'Armarios', '...",1690000
3,Piso en venta,https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.inmofactory.com/images/inmofact...,España,Centro,Embajadores - Lavapiés,,28012,Madrid,Flat,...,83,^,,,Muy bien,+ 100 años,7,0,"{'3': 'Calefacción', '18': 'Suite - con baño',...",424500
4,Piso en venta,https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.inmofactory.com/images/inmofact...,España,Chamberí,Almagro,,28010,Madrid,Flat,...,55,E,,,Muy bien,70 a 100 años,0,0,"{'3': 'Calefacción', '119': 'Ascensor interior...",320000
5,Piso en venta,https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.inmofactory.com/images/inmofact...,España,Barrio de Salamanca,Guindalera,,28028,Madrid,Flat,...,47,F,,,Muy bien,,0,0,"{'1': 'Aire acondicionado', '2': 'Armarios', '...",315000
6,Piso en venta en Calle Gardenia,https://www.fotocasa.es/es/comprar/vivienda/al...,https://static.inmofactory.com/images/inmofact...,España,Reyes Católicos,,Calle Gardenia,28803,Madrid,Flat,...,87,G,,,,,9,0,"{'1': 'Aire acondicionado', '2': 'Armarios', '...",145000
7,Ático en venta,https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.inmofactory.com/images/inmofact...,España,Chamberí,Trafalgar,,28010,Madrid,Flat,...,102,G,,,Muy bien,70 a 100 años,11,0,"{'3': 'Calefacción', '10': 'Terraza', '119': '...",700000
8,Piso en venta,https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.inmofactory.com/images/inmofact...,España,Golf - El Carralero,,,28222,Madrid,Flat,...,180,E,Gas Natural,Gas Natural,Muy bien,,0,0,"{'1': 'Aire acondicionado', '2': 'Armarios', '...",770000
9,Ático en venta,https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.inmofactory.com/images/inmofact...,España,Arganzuela,Imperial,,28005,Madrid,Flat,...,175,G,Gas Natural,Gas Natural,Muy bien,30 a 50 años,0,0,"{'1': 'Aire acondicionado', '2': 'Armarios', '...",995000


In [149]:
ft_scraping.data.tail(10)

Unnamed: 0,title,link,image_url,country,district,neighborhood,street,zipCode,province,buildingType,...,surface,energyCertificate,hotWater,heating,conservationState,antiquity,floor,surfaceLand,otherFeatures,price
28934,"Piso en venta en Calle Rafael Finat, 73",https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.fotocasa.es/images/anuncio/2022...,España,Latina,Las Águilas,"Calle Rafael Finat, 73",28044,Madrid,Flat,...,64,G,,,A reformar,50 a 70 años,0,0,"{'1': 'Aire acondicionado', '3': 'Calefacción'...",136000
28935,"Piso en venta en Calle de Atocha, 117",https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.inmofactory.com/images/inmofact...,España,Centro,Cortes - Huertas,"Calle de Atocha, 117",28012,Madrid,Flat,...,167,E,Gas Natural,Gas Natural,Bien,+ 100 años,7,0,"{'2': 'Armarios', '32': 'Balcón'}",899000
28936,"Piso en venta en Calle Colón, 1",https://www.fotocasa.es/es/comprar/vivienda/se...,https://static.fotocasa.es/images/anuncio/2022...,España,,,"Calle Colón, 1",28609,Madrid,Flat,...,52,G,,,Muy bien,10 a 20 años,0,0,{'32': 'Balcón'},155000
28937,Piso en venta en Calle Pablo Casals,https://www.fotocasa.es/es/comprar/vivienda/pa...,https://static.fotocasa.es/images/anuncio/2022...,España,Fuentebella -San Felix - El Leguario,,Calle Pablo Casals,28981,Madrid,Flat,...,94,G,,,Bien,30 a 50 años,0,0,{'10': 'Terraza'},122000
28938,"Piso en venta en Plaza de España, 3",https://www.fotocasa.es/es/comprar/vivienda/ge...,https://static.fotocasa.es/images/anuncio/2022...,España,Juan de la Cierva,,"Plaza de España, 3",28903,Madrid,Flat,...,65,G,,,Reformado,,0,0,"{'2': 'Armarios', '3': 'Calefacción', '10': 'T...",210000
28939,Piso en venta,https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.inmofactory.com/images/inmofact...,España,Barajas,Alameda de Osuna,,28042,Madrid,Flat,...,100,G,Gas Natural,Gas Natural,Casi nuevo,,0,0,"{'1': 'Aire acondicionado', '2': 'Armarios', '...",450000
28940,Piso en venta en Plaza de las Hilanderas,https://www.fotocasa.es/es/comprar/vivienda/ma...,https://static.fotocasa.es/images/anuncio/2022...,España,Carabanchel,Abrantes,Plaza de las Hilanderas,28025,Madrid,Flat,...,70,C,,,Bien,30 a 50 años,0,0,"{'1': 'Aire acondicionado', '2': 'Armarios', '...",150000
28941,Piso en venta en Calle Palacio,https://www.fotocasa.es/es/comprar/vivienda/ca...,https://static.fotocasa.es/images/anuncio/2022...,España,,,Calle Palacio,28560,Madrid,Flat,...,100,G,,,Casi nuevo,10 a 20 años,0,0,"{'2': 'Armarios', '7': 'Jardín Privado', '9': ...",115000
28942,Piso en venta en Calle Jacinto Benavente,https://www.fotocasa.es/es/comprar/vivienda/vi...,https://static.inmofactory.com/images/inmofact...,España,Villanueva de la Cañada ciudad,,Calle Jacinto Benavente,28691,Madrid,Flat,...,100,G,Propano,Propano,Bien,30 a 50 años,7,0,"{'2': 'Armarios', '6': 'Gres Cerámica', '10': ...",240000
28943,Casa o chalet en venta en Calle del Río Manzan...,https://www.fotocasa.es/es/comprar/vivienda/ri...,https://static.fotocasa.es/images/anuncio/2022...,España,Urbanizaciones,,"Calle del Río Manzanares, 16",28523,Madrid,Flat,...,255,C,,,Reformado,,0,336,"{'1': 'Aire acondicionado', '2': 'Armarios', '...",599000


In [150]:
ft_scraping.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28944 entries, 0 to 28943
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              28944 non-null  object
 1   link               28944 non-null  object
 2   image_url          28944 non-null  object
 3   country            28944 non-null  object
 4   district           28944 non-null  object
 5   neighborhood       28944 non-null  object
 6   street             28944 non-null  object
 7   zipCode            28944 non-null  object
 8   province           28944 non-null  object
 9   buildingType       28944 non-null  object
 10  clientAlias        28944 non-null  object
 11  latitude           28944 non-null  object
 12  longitude          28944 non-null  object
 13  isNewConstruction  28944 non-null  object
 14  rooms              28944 non-null  object
 15  bathrooms          28944 non-null  object
 16  parking            28944 non-null  objec