In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import requests
import time
import os

In [None]:
import json
f = open('./token.json','r')
access = json.load(f)
f.close()

In [None]:
class idealista_search_api():
  
  def __init__(self,access:dict):
    self.token = access['access_token']
    self.token_type = access['token_type']
    self.country = 'es'
    self.api_version = '3.5'
    self.endpoint = f'https://api.idealista.com/{self.api_version}/{self.country}/search'
    
  def request(self,payload:dict):
    headers = {
        'Authorization': f'{self.token_type} {self.token}'
    }
    response = requests.post(self.endpoint,params=payload,headers=headers)
    if response.status_code == 200:
      return response
    response.raise_for_status()

  def extract_data(self,payload:dict) -> pd.DataFrame:
    print('this is the payload:',payload)
    data = self.request(payload).json()
    totalPages = data['totalPages']
    self.dataset = pd.DataFrame(data['elementList'])
    self.dataset.to_csv('idealista_real_estate_madrid_housing.csv')
    print('first extraction done')
    print(self.dataset)

    print(f'total pages: {totalPages}')

    for page in range(2,totalPages+1):
      time.sleep(1)
      print(page)
      payload['numPage'] = page
      data = self.request(payload).json()
      self.dataset = self.dataset.append(data['elementList'])
      pd.DataFrame(data['elementList']).to_csv('idealista_real_estate_madrid_housing.csv',mode='a')

In [None]:
idealista = idealista_search_api(access)

In [None]:
payload = {
    'operation':'sale',
    'propertyType':'homes',
    'maxItems':50,
    'order':'price',
    'sort':'desc',
    'locationId':'0-EU-ES-28-07-001-079'
    }

idealista.extract_data(payload)

In [None]:
f = open('idealista_results.json','r')
text = ''
for l in f:
  text += l
  print(l)
f.close()

{"elementList":[{"propertyCode":"98286055","thumbnail":"https://img3.idealista.com/blur/WEB_LISTING/0/id.pro.es.image.master/c9/fd/90/1006698313.jpg","externalReference":"58248","numPhotos":62,"floor":"2","price":284900.0,"propertyType":"flat","operation":"sale","size":75.0,"exterior":true,"rooms":3,"bathrooms":1,"address":"Calle de Sambara, 43","province":"Madrid","municipality":"Madrid","district":"Ciudad Lineal","country":"es","neighborhood":"Quintana","latitude":40.4343062,"longitude":-3.6524697,"showAddress":true,"url":"https://www.idealista.com/inmueble/98286055/","description":"VIVIENDA DE 75 m2 con REFORMADA, DECORADA, AMUEBLADA Y CON GARANTÍA. El piso está formado por salón comedor, cocina, tres dormitorios, baño. Al entrar en la vivienda nos encontramos con la cocina amueblada en color blanco con encimera en madera y equipada con electrodomésticos. El salón de la vivienda es amplio y luminoso, gracias a su terraza integrada a la estancia, tiene lámpara de diseño que añade van

In [None]:
df = pd.DataFrame(json.loads(text)['elementList'])
df

Unnamed: 0,propertyCode,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,exterior,...,detailedType,suggestedTexts,hasPlan,has3DTour,has360,hasStaging,superTopHighlight,topNewDevelopment,parkingSpace,labels
0,98286055,https://img3.idealista.com/blur/WEB_LISTING/0/...,58248,62,2,284900.0,flat,sale,75.0,True,...,{'typology': 'flat'},"{'subtitle': 'Quintana, Madrid', 'title': 'Pis...",True,True,False,False,False,False,,
1,98284505,https://img3.idealista.com/blur/WEB_LISTING/0/...,56753,49,1,294900.0,flat,sale,90.0,True,...,{'typology': 'flat'},"{'subtitle': 'Quintana, Madrid', 'title': 'Pis...",True,True,False,False,False,False,,
2,97313277,https://img3.idealista.com/blur/WEB_LISTING/0/...,BS169771,75,3,835000.0,flat,sale,111.0,False,...,{'typology': 'flat'},"{'subtitle': 'Recoletos, Madrid', 'title': 'Pi...",True,True,False,True,False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",
3,98231381,https://img3.idealista.com/blur/WEB_LISTING/0/...,BS175620,77,bj,950000.0,duplex,sale,112.0,True,...,"{'typology': 'flat', 'subTypology': 'duplex'}","{'subtitle': 'Castellana, Madrid', 'title': 'D...",True,True,False,False,False,False,,
4,95386296,https://img3.idealista.com/blur/WEB_LISTING/0/...,MV4178I,72,1,1995000.0,flat,sale,274.0,True,...,{'typology': 'flat'},"{'subtitle': 'Castellana, Madrid', 'title': 'P...",True,True,False,True,False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
5,97466696,https://img3.idealista.com/blur/WEB_LISTING/0/...,17208,26,1,2500000.0,flat,sale,463.0,True,...,{'typology': 'flat'},"{'subtitle': 'El Viso, Madrid', 'title': 'Piso...",True,True,False,False,False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
6,98185667,https://img3.idealista.com/blur/WEB_LISTING/0/...,MV4491I,52,1,1125000.0,flat,sale,162.0,True,...,{'typology': 'flat'},"{'subtitle': 'Castellana, Madrid', 'title': 'P...",True,True,False,True,False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
7,94189842,https://img3.idealista.com/blur/WEB_LISTING/0/...,13634,51,,2750000.0,chalet,sale,720.0,False,...,"{'typology': 'chalet', 'subTypology': 'indepen...","{'subtitle': 'Conde Orgaz-Piovera, Madrid', 't...",True,True,False,True,False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'villaType', 'text': 'Villa'}, {'nam..."
8,98034360,https://img3.idealista.com/blur/WEB_LISTING/0/...,V225Q15,65,,1690000.0,chalet,sale,482.0,False,...,"{'typology': 'chalet', 'subTypology': 'terrace...","{'subtitle': 'Conde Orgaz-Piovera, Madrid', 't...",True,True,False,False,False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
9,98250542,https://img3.idealista.com/blur/WEB_LISTING/0/...,MV175766,43,2,1999000.0,flat,sale,439.0,True,...,{'typology': 'flat'},"{'subtitle': 'Gaztambide, Madrid', 'title': 'P...",True,True,False,False,False,False,,


In [None]:
df.columns

Index(['propertyCode', 'thumbnail', 'externalReference', 'numPhotos', 'floor',
       'price', 'propertyType', 'operation', 'size', 'exterior', 'rooms',
       'bathrooms', 'address', 'province', 'municipality', 'district',
       'country', 'neighborhood', 'latitude', 'longitude', 'showAddress',
       'url', 'description', 'hasVideo', 'status', 'newDevelopment', 'hasLift',
       'priceByArea', 'detailedType', 'suggestedTexts', 'hasPlan', 'has3DTour',
       'has360', 'hasStaging', 'superTopHighlight', 'topNewDevelopment',
       'parkingSpace', 'labels'],
      dtype='object')

There are some columns that are crearly not needed, like thumbnail, externalReference, numPhotos, country, province and municipality (we already know it is the city of Madrid), hasVideo, has360, operation (it's always sale),hasPlan, has3DTour, has360 and more.


In [None]:
cols_to_drop = ['thumbnail', 'externalReference', 'numPhotos',
                'country', 'province', 'municipality', 
                'hasVideo', 'has360', 'operation', 'hasPlan',
                'has3DTour', 'has360', 'hasStaging','description','url','latitude',
                'longitude','showAddress', 'address', 'superTopHighlight', 'topNewDevelopment']
df = df.drop(cols_to_drop,axis=1)
del cols_to_drop
df

Unnamed: 0,propertyCode,floor,price,propertyType,size,exterior,rooms,bathrooms,district,neighborhood,status,newDevelopment,hasLift,priceByArea,detailedType,suggestedTexts,superTopHighlight,topNewDevelopment,parkingSpace,labels
0,98286055,2,284900.0,flat,75.0,True,3,1,Ciudad Lineal,Quintana,good,False,True,3799.0,{'typology': 'flat'},"{'subtitle': 'Quintana, Madrid', 'title': 'Pis...",False,False,,
1,98284505,1,294900.0,flat,90.0,True,3,2,Ciudad Lineal,Quintana,good,False,False,3277.0,{'typology': 'flat'},"{'subtitle': 'Quintana, Madrid', 'title': 'Pis...",False,False,,
2,97313277,3,835000.0,flat,111.0,False,2,3,Barrio de Salamanca,Recoletos,renew,False,True,7523.0,{'typology': 'flat'},"{'subtitle': 'Recoletos, Madrid', 'title': 'Pi...",False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",
3,98231381,bj,950000.0,duplex,112.0,True,2,1,Barrio de Salamanca,Castellana,good,False,True,8482.0,"{'typology': 'flat', 'subTypology': 'duplex'}","{'subtitle': 'Castellana, Madrid', 'title': 'D...",False,False,,
4,95386296,1,1995000.0,flat,274.0,True,5,5,Barrio de Salamanca,Castellana,good,False,True,7281.0,{'typology': 'flat'},"{'subtitle': 'Castellana, Madrid', 'title': 'P...",False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
5,97466696,1,2500000.0,flat,463.0,True,4,5,Chamartín,El Viso,good,False,True,5400.0,{'typology': 'flat'},"{'subtitle': 'El Viso, Madrid', 'title': 'Piso...",False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
6,98185667,1,1125000.0,flat,162.0,True,4,3,Barrio de Salamanca,Castellana,good,False,True,6944.0,{'typology': 'flat'},"{'subtitle': 'Castellana, Madrid', 'title': 'P...",False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
7,94189842,,2750000.0,chalet,720.0,False,6,6,Hortaleza,Conde Orgaz-Piovera,good,False,,3819.0,"{'typology': 'chalet', 'subTypology': 'indepen...","{'subtitle': 'Conde Orgaz-Piovera, Madrid', 't...",False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'villaType', 'text': 'Villa'}, {'nam..."
8,98034360,,1690000.0,chalet,482.0,False,4,6,Hortaleza,Conde Orgaz-Piovera,good,False,,3506.0,"{'typology': 'chalet', 'subTypology': 'terrace...","{'subtitle': 'Conde Orgaz-Piovera, Madrid', 't...",False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
9,98250542,2,1999000.0,flat,439.0,True,5,3,Chamberí,Gaztambide,good,False,True,4554.0,{'typology': 'flat'},"{'subtitle': 'Gaztambide, Madrid', 'title': 'P...",False,False,,


In [None]:
df.columns

Index(['propertyCode', 'floor', 'price', 'propertyType', 'size', 'exterior',
       'rooms', 'bathrooms', 'district', 'neighborhood', 'status',
       'newDevelopment', 'hasLift', 'priceByArea', 'detailedType',
       'suggestedTexts', 'superTopHighlight', 'topNewDevelopment',
       'parkingSpace', 'labels'],
      dtype='object')

In [None]:
df['suggestedTexts']

0     {'subtitle': 'Quintana, Madrid', 'title': 'Pis...
1     {'subtitle': 'Quintana, Madrid', 'title': 'Pis...
2     {'subtitle': 'Recoletos, Madrid', 'title': 'Pi...
3     {'subtitle': 'Castellana, Madrid', 'title': 'D...
4     {'subtitle': 'Castellana, Madrid', 'title': 'P...
5     {'subtitle': 'El Viso, Madrid', 'title': 'Piso...
6     {'subtitle': 'Castellana, Madrid', 'title': 'P...
7     {'subtitle': 'Conde Orgaz-Piovera, Madrid', 't...
8     {'subtitle': 'Conde Orgaz-Piovera, Madrid', 't...
9     {'subtitle': 'Gaztambide, Madrid', 'title': 'P...
10    {'subtitle': 'Arapiles, Madrid', 'title': 'Pis...
11    {'subtitle': 'Goya, Madrid', 'title': 'Piso en...
12    {'subtitle': 'San Pascual, Madrid', 'title': '...
13    {'subtitle': 'Berruguete, Madrid', 'title': 'P...
14    {'subtitle': 'Recoletos, Madrid', 'title': 'Pi...
15    {'subtitle': 'Recoletos, Madrid', 'title': 'Pi...
16    {'subtitle': 'Recoletos, Madrid', 'title': 'Pi...
17    {'subtitle': 'Almagro, Madrid', 'title': '

In [None]:
df['suggestedTexts'].apply(pd.Series)

Unnamed: 0,subtitle,title
0,"Quintana, Madrid","Piso en Calle de Sambara, 43"
1,"Quintana, Madrid","Piso en Calle Sambara, 101"
2,"Recoletos, Madrid",Piso en AYALA
3,"Castellana, Madrid","Dúplex en Calle de Claudio Coello, 125"
4,"Castellana, Madrid",Piso en Calle de Núñez de Balboa
5,"El Viso, Madrid",Piso en Calle de Alfonso Rodríguez Santamaría
6,"Castellana, Madrid",Piso en Calle de Castelló
7,"Conde Orgaz-Piovera, Madrid",Casa independiente en Ronda de Cala Basa
8,"Conde Orgaz-Piovera, Madrid","Chalet adosado en Calle Toronga, 13"
9,"Gaztambide, Madrid",Piso en Calle DE CEA BERMUDEZ


In [None]:
df['suggestedTexts'].apply(pd.Series)['subtitle']==df['neighborhood'].apply(lambda x: x+', Madrid')

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
dtype: bool

In [None]:
df = df.drop('suggestedTexts',axis=1)
df

Unnamed: 0,propertyCode,floor,price,propertyType,size,exterior,rooms,bathrooms,district,neighborhood,status,newDevelopment,hasLift,priceByArea,detailedType,superTopHighlight,topNewDevelopment,parkingSpace,labels
0,98286055,2,284900.0,flat,75.0,True,3,1,Ciudad Lineal,Quintana,good,False,True,3799.0,{'typology': 'flat'},False,False,,
1,98284505,1,294900.0,flat,90.0,True,3,2,Ciudad Lineal,Quintana,good,False,False,3277.0,{'typology': 'flat'},False,False,,
2,97313277,3,835000.0,flat,111.0,False,2,3,Barrio de Salamanca,Recoletos,renew,False,True,7523.0,{'typology': 'flat'},False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",
3,98231381,bj,950000.0,duplex,112.0,True,2,1,Barrio de Salamanca,Castellana,good,False,True,8482.0,"{'typology': 'flat', 'subTypology': 'duplex'}",False,False,,
4,95386296,1,1995000.0,flat,274.0,True,5,5,Barrio de Salamanca,Castellana,good,False,True,7281.0,{'typology': 'flat'},False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
5,97466696,1,2500000.0,flat,463.0,True,4,5,Chamartín,El Viso,good,False,True,5400.0,{'typology': 'flat'},False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
6,98185667,1,1125000.0,flat,162.0,True,4,3,Barrio de Salamanca,Castellana,good,False,True,6944.0,{'typology': 'flat'},False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
7,94189842,,2750000.0,chalet,720.0,False,6,6,Hortaleza,Conde Orgaz-Piovera,good,False,,3819.0,"{'typology': 'chalet', 'subTypology': 'indepen...",False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'villaType', 'text': 'Villa'}, {'nam..."
8,98034360,,1690000.0,chalet,482.0,False,4,6,Hortaleza,Conde Orgaz-Piovera,good,False,,3506.0,"{'typology': 'chalet', 'subTypology': 'terrace...",False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","[{'name': 'luxuryType', 'text': 'Lujo'}]"
9,98250542,2,1999000.0,flat,439.0,True,5,3,Chamberí,Gaztambide,good,False,True,4554.0,{'typology': 'flat'},False,False,,


In [None]:
df['detailedType'].apply(pd.Series)

Unnamed: 0,typology,subTypology
0,flat,
1,flat,
2,flat,
3,flat,duplex
4,flat,
5,flat,
6,flat,
7,chalet,independantHouse
8,chalet,terracedHouse
9,flat,


In [None]:
df.explode('labels')

Unnamed: 0,propertyCode,floor,price,propertyType,size,exterior,rooms,bathrooms,district,neighborhood,status,newDevelopment,hasLift,priceByArea,detailedType,superTopHighlight,topNewDevelopment,parkingSpace,labels
0,98286055,2,284900.0,flat,75.0,True,3,1,Ciudad Lineal,Quintana,good,False,True,3799.0,{'typology': 'flat'},False,False,,
1,98284505,1,294900.0,flat,90.0,True,3,2,Ciudad Lineal,Quintana,good,False,False,3277.0,{'typology': 'flat'},False,False,,
2,97313277,3,835000.0,flat,111.0,False,2,3,Barrio de Salamanca,Recoletos,renew,False,True,7523.0,{'typology': 'flat'},False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",
3,98231381,bj,950000.0,duplex,112.0,True,2,1,Barrio de Salamanca,Castellana,good,False,True,8482.0,"{'typology': 'flat', 'subTypology': 'duplex'}",False,False,,
4,95386296,1,1995000.0,flat,274.0,True,5,5,Barrio de Salamanca,Castellana,good,False,True,7281.0,{'typology': 'flat'},False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","{'name': 'luxuryType', 'text': 'Lujo'}"
5,97466696,1,2500000.0,flat,463.0,True,4,5,Chamartín,El Viso,good,False,True,5400.0,{'typology': 'flat'},False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","{'name': 'luxuryType', 'text': 'Lujo'}"
6,98185667,1,1125000.0,flat,162.0,True,4,3,Barrio de Salamanca,Castellana,good,False,True,6944.0,{'typology': 'flat'},False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","{'name': 'luxuryType', 'text': 'Lujo'}"
7,94189842,,2750000.0,chalet,720.0,False,6,6,Hortaleza,Conde Orgaz-Piovera,good,False,,3819.0,"{'typology': 'chalet', 'subTypology': 'indepen...",False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","{'name': 'villaType', 'text': 'Villa'}"
7,94189842,,2750000.0,chalet,720.0,False,6,6,Hortaleza,Conde Orgaz-Piovera,good,False,,3819.0,"{'typology': 'chalet', 'subTypology': 'indepen...",False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","{'name': 'luxuryType', 'text': 'Lujo'}"
8,98034360,,1690000.0,chalet,482.0,False,4,6,Hortaleza,Conde Orgaz-Piovera,good,False,,3506.0,"{'typology': 'chalet', 'subTypology': 'terrace...",False,False,"{'hasParkingSpace': True, 'isParkingSpaceInclu...","{'name': 'luxuryType', 'text': 'Lujo'}"


In [31]:
import requests

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0',
           'referer': 'https://www.idealista.com/buscar/venta-viviendas/madrid-madrid/madrid/' }

url = urlsplit('https://www.idealista.com/buscar/venta-viviendas/madrid-madrid/madrid/pagina-1.htm?ordenado-por=fecha-publicacion-desc')


response = requests.get(url.geturl(), headers=headers)
response.status_code

200

In [36]:
from bs4 import BeautifulSoup
from urllib.parse import urlsplit

soup = BeautifulSoup(response.text,'lxml')

links = [url.scheme+'://'+url.hostname+x.get('href') for x in soup.select('a.item-link')]

In [37]:
links

['https://www.idealista.com/inmueble/98318058/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316250/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98318012/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316371/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316567/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316630/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317810/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98314629/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316612/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317314/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317294/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98300888/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317156/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317019/?xtmc=1_1_madrid&x

In [43]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlsplit
import time


headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0',
           'referer': 'https://www.idealista.com/buscar/venta-viviendas/madrid-madrid/madrid/' }


links  = []

for page in range(1,3):

  url = urlsplit(f'https://www.idealista.com/buscar/venta-viviendas/madrid-madrid/madrid/pagina-{page}.htm?ordenado-por=fecha-publicacion-desc')
  response = requests.get(url.geturl(), headers=headers)
  if response.status_code == 200:
    soup = BeautifulSoup(response.text,'lxml')
    thisPageLinks = [url.scheme+'://'+url.hostname+x.get('href') for x in soup.select('a.item-link')]
    links.extend(thisPageLinks)
    headers['referer'] = url.geturl()
    print(f'New referrer: {headers["referer"]}')
    time.sleep(3)
  else:
    raise f'Request returned error {response.status_code}'

links

New referrer: https://www.idealista.com/buscar/venta-viviendas/madrid-madrid/madrid/pagina-1.htm?ordenado-por=fecha-publicacion-desc
New referrer: https://www.idealista.com/buscar/venta-viviendas/madrid-madrid/madrid/pagina-2.htm?ordenado-por=fecha-publicacion-desc


['https://www.idealista.com/inmueble/98316464/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98318058/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316250/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98318012/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316371/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316567/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316630/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317810/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98314629/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316612/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317314/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317294/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98300888/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317156/?xtmc=1_1_madrid&x

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlsplit
import time


headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0',
           'referer': 'https://www.idealista.com/buscar/venta-viviendas/madrid-madrid/madrid/' }


links  = []

for page in range(1,61):

  url = urlsplit(f'https://www.idealista.com/buscar/venta-viviendas/madrid-madrid/madrid/pagina-{page}.htm?ordenado-por=fecha-publicacion-desc')
  response = requests.get(url.geturl(), headers=headers)
  if response.status_code == 200:
    soup = BeautifulSoup(response.text,'lxml')
    thisPageLinks = [url.scheme+'://'+url.hostname+x.get('href') for x in soup.select('a.item-link')]
    links.extend(thisPageLinks)
    headers['referer'] = url.geturl()
    print(f'Page {page} completed..')
    time.sleep(3)
  else:
    raise f'Request returned error {response.status_code}'

In [51]:
links

['https://www.idealista.com/inmueble/98316716/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316464/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98318058/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316250/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98318012/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316371/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316567/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316630/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317810/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98314629/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98316612/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317314/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98317294/?xtmc=1_1_madrid&xtcr=0',
 'https://www.idealista.com/inmueble/98300888/?xtmc=1_1_madrid&x

In [52]:
import pandas as pd

pd.DataFrame(links, columns=['links']).to_csv('links.csv')

In [118]:
index_counter = 0
for link in links:
  response = requests.get(link,headers=headers)
  if response.status_code == 200:
    soup = BeautifulSoup(response.text,'lxml')
    utag_script = [x for x in soup.select('script') if 'utag_data' in x.get_text()]
    utag_data = json.loads(str(utag_script[0]).split(';')[0].split(' ')[7])
    home_data = {
        'id':utag_data['ad']['id'],
        'price':utag_data['ad']['price'],
        'size':utag_data['ad']['characteristics']['constructedArea'],
        'hasParking':utag_data['ad']['characteristics']['hasParking'],
        'roomNumber':utag_data['ad']['characteristics']['roomNumber'],
        'bathNumber':utag_data['ad']['characteristics']['bathNumber'],
        'hasSwimmingPool':utag_data['ad']['characteristics']['hasSwimmingPool'] or "0",
        'hasTerrace':utag_data['ad']['characteristics']['hasTerrace'],
        'hasGarden':utag_data['ad']['characteristics']['hasGarden'],
        'hasLift':utag_data['ad']['characteristics']['hasLift'],
        'isGoodCondition':utag_data['ad']['condition']['isGoodCondition'],
        'isNeedsRenovating':utag_data['ad']['condition']['isNeedsRenovating'],
        'isNewDevelopment':utag_data['ad']['condition']['isNewDevelopment']
    }
    if index_counter == 0:
      idealista_dataset = pd.DataFrame(home_data, index=[index_counter])
    else:
      idealista_dataset.append(home_data,ignore_index=True)
    print(index_counter)
    time.sleep(3)
    index_counter += 1
  else:
    raise f"Error while processing {link}"

0
1


KeyError: ignored

In [53]:
response = requests.get(links[0],headers=headers)
response.status_code

200

In [None]:
soup = BeautifulSoup(response.text,'lxml')
soup

In [69]:
utag_script = [x for x in soup.select('script') if 'utag_data' in x.get_text()]

In [91]:
utag_data = json.loads(str(utag_script[0]).split(';')[0].split(' ')[7])

In [95]:
utag_data['ad']

{'address': {'locationId': '0-EU-ES-28-07-001-079-03-004',
  'locationLevel': '8',
  'municipalityId': '0-EU-ES-28-07-001-079',
  'provinceId': '0-EU-ES-28'},
 'builtType': '2',
 'characteristics': {'bathNumber': '1',
  'constructedArea': '42',
  'hasGarden': '0',
  'hasLift': '1',
  'hasParking': '0',
  'hasSwimmingPool': '0',
  'hasTerrace': '0',
  'roomNumber': '2'},
 'condition': {'isGoodCondition': '0',
  'isNeedsRenovating': '1',
  'isNewDevelopment': '0'},
 'energyCertification': {'suffix': '', 'type': 'unknown'},
 'hasRecommended': '1',
 'id': '98316716',
 'isAuction': '0',
 'isRecommended': '0',
 'media': {'has3Dtour': '0',
  'hasFloorPlan': '1',
  'hasHomeStaging': '0',
  'photoNumber': '18',
  'videoNumber': '0'},
 'numberRecommended': '',
 'operation': '1',
 'origin': '1',
 'originTypeRecommended': '',
 'owner': {'chatIsActive': '1',
  'commercialId': '',
  'commercialName': '',
  'contactPreference': '1',
  'type': '1'},
 'price': '345000',
 'recommendationId': '',
 'typeR

In [99]:
home_data = {
    'id':utag_data['ad']['id'],
    'price':utag_data['ad']['price'],
    'size':utag_data['ad']['characteristics']['constructedArea'],
    'hasParking':utag_data['ad']['characteristics']['hasParking'],
    'roomNumber':utag_data['ad']['characteristics']['roomNumber'],
    'bathNumber':utag_data['ad']['characteristics']['bathNumber'],
    'hasSwimmingPool':utag_data['ad']['characteristics']['hasSwimmingPool'],
    'hasTerrace':utag_data['ad']['characteristics']['hasTerrace'],
    'hasGarden':utag_data['ad']['characteristics']['hasGarden'],
    'hasLift':utag_data['ad']['characteristics']['hasLift'],
    'isGoodCondition':utag_data['ad']['condition']['isGoodCondition'],
    'isNeedsRenovating':utag_data['ad']['condition']['isNeedsRenovating'],
    'isNewDevelopment':utag_data['ad']['condition']['isNewDevelopment']
}

In [100]:
home_data

{'bathNumber': '1',
 'hasGarden': '0',
 'hasLift': '1',
 'hasParking': '0',
 'hasSwimmingPool': '0',
 'hasTerrace': '0',
 'id': '98316716',
 'isGoodCondition': '0',
 'isNeedsRenovating': '1',
 'isNewDevelopment': '0',
 'price': '345000',
 'roomNumber': '2',
 'size': '42'}

In [114]:
pd.DataFrame(home_data,index=[0])

Unnamed: 0,id,price,size,hasParking,roomNumber,bathNumber,hasSwimmingPool,hasTerrace,hasGarden,hasLift,isGoodCondition,isNeedsRenovating,isNewDevelopment
0,98316716,345000,42,0,2,1,0,0,0,1,0,1,0
