-
Notifications
You must be signed in to change notification settings - Fork 0
/
idealista_scraper.py
77 lines (59 loc) · 2.62 KB
/
idealista_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import csv
import os
from datetime import datetime

import requests
import yaml
from bs4 import BeautifulSoup
# Column order of the exported CSV. The scraper fills only some of these
# columns; the others are written as empty cells.
headers = [
    "Title",
    "Link",
    "Type",
    "Asking",
    "Notes",
    "Where",
    "Size (garden)",
    "Rooms",
    "Rating",
    "Plan",
    "Agent",
    "Status",
]
# Load the scraper settings. The ZenRows API key is read from config.yml
# under the "zenrows_api_key" entry.
with open("config.yml", "r", encoding="utf-8") as yaml_file:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so the lookup below cannot fail with AttributeError.
    config = yaml.safe_load(yaml_file) or {}

api_key = config.get("zenrows_api_key")
if not api_key:
    # Stop early: requests omits parameters whose value is None, so a missing
    # key would otherwise surface later as a failed status on every URL.
    raise SystemExit("config.yml must define a non-empty 'zenrows_api_key'.")
# Collect the listing URLs to scrape: one per line, surrounding whitespace
# removed, blank lines ignored.
urls = []
with open("data/idealista_urls.txt", "r") as url_file:
    for raw_line in url_file:
        cleaned = raw_line.strip()
        if cleaned:
            urls.append(cleaned)
# Name the CSV after the local time at which the run started.
timestamp = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
output_filename = "output/idealista_" + timestamp + ".csv"
# Make sure the destination folder exists; otherwise open() below raises
# FileNotFoundError when the folder has not been created yet.
os.makedirs(os.path.dirname(output_filename) or ".", exist_ok=True)

# Fetch every listing URL through the ZenRows API and write one CSV row per
# page that could be parsed. A failure on a single URL (network error,
# non-200 status, or a page missing the expected elements) is reported and
# skipped so the remaining URLs are still processed.
# The encoding is explicit so the output does not depend on the platform
# default (the size column contains the non-ASCII character "²").
with open(output_filename, mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(headers)

    for url in urls:
        params = {
            'url': url,
            'apikey': api_key,
            'premium_proxy': 'true',
        }

        try:
            # requests never times out by default; bound both the connect and
            # the read phase. The read limit is deliberately generous since the
            # page is fetched through a proxy - tune if needed.
            response = requests.get(
                'https://api.zenrows.com/v1/',
                params=params,
                timeout=(10, 180),
            )
        except requests.RequestException as error:
            print(url, "Failed to retrieve the webpage. Error:", error)
            print("-------")
            continue

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            try:
                # Title
                title = soup.find("span", class_="main-info__title-main").text.strip()

                # Price (dots are removed from the displayed figure)
                span_element = soup.find("span", class_="info-data-price")
                price = span_element.find("span", class_="txt-bold").text.replace(".", "").strip()

                # Rooms & Size: first span is the size, second the room count
                div_element = soup.find("div", class_="info-features")
                spans = div_element.find_all("span")
                size = spans[0].text.replace("m2", "m²").strip()
                rooms = spans[1].text.replace("locali", "").strip()

                # Type: first entry of the property features list
                div_element = soup.find("div", class_="details-property_features")
                li_elements = div_element.find_all("li")
                property_type = li_elements[0].text.strip()
            except (AttributeError, IndexError) as error:
                # find() returned None, or a list had fewer items than
                # expected: the page does not have the expected structure.
                print(url, "Failed to parse the listing page:", error)
            else:
                # Agent: falls back to "Privato" when no advertiser link exists
                a_element = soup.find("a", class_="about-advertiser-name")
                if a_element is not None:
                    agent = a_element.text.strip()
                else:
                    agent = "Privato"

                # Print the extracted data
                print("Url", url)
                print("Title:", title)
                print("Price:", price)
                print("Size:", size)
                print("Rooms:", rooms)
                print("Type:", property_type)
                print("Agent:", agent)

                # Empty strings keep the row aligned with the header columns
                # that the scraper does not fill.
                writer.writerow([title, url, property_type, price, "", "", size, rooms, "", "", agent, ""])
        else:
            print(url, "Failed to retrieve the webpage. Status code:", response.status_code)

        print("-------")

print(f"CSV file '{output_filename}' has been created with all the extracted values and headers.")