In [3]:
from bs4 import BeautifulSoup
import requests

def extract_listing_info(html_content):
    """
    Extracts information about property listings from HTML content.

    Every ``<div data-id="listing-card-container">`` is treated as one listing.
    A field is only added to a listing's dictionary when the corresponding
    element is present in the card, so callers should read fields with
    ``dict.get``. ``virtual_tour`` is the exception: it is always set.

    Args:
        html_content: The HTML content of the page containing the listings.

    Returns:
        A list of dictionaries, where each dictionary represents a property listing
        and contains the extracted information. Possible keys: ``id``, ``price``,
        ``location``, ``type``, ``area``, ``bedrooms``, ``bathrooms``, ``agent``,
        ``image_url``, ``virtual_tour`` ('Yes'/'No') and ``link``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    listings = []

    for card in soup.find_all('div', {'data-id': 'listing-card-container'}):
        listing = {}

        # ID: carried by the card's parent element as "listing-list-card-<id>".
        # The parent may have no id attribute, in which case the key is omitted
        # instead of crashing on None.replace(...).
        parent_id = card.parent.get('id') if card.parent is not None else None
        if parent_id:
            listing['id'] = parent_id.replace('listing-list-card-', '')

        # Price
        price_element = card.find('b', class_='relative leading-[140%]')
        if price_element:
            listing['price'] = price_element.text.strip()

        # Location
        location_element = card.find('div', class_='w-full relative leading-[170%]')
        if location_element:
            listing['location'] = location_element.text.strip()

        # Type
        type_element = card.find('b', class_='relative leading-[160%] truncate')
        if type_element:
            listing['type'] = type_element.text.strip()

        # Area / Bedrooms / Bathrooms share one CSS class and are told apart by
        # position only. Query once; zip() stops early when a card exposes fewer
        # than three values, so a sparse card no longer raises IndexError.
        stat_elements = card.find_all('b', class_='relative leading-[160%] whitespace-nowrap')
        for key, stat_element in zip(('area', 'bedrooms', 'bathrooms'), stat_elements):
            listing[key] = stat_element.text.strip()

        # Agent (the name is taken from the alt text of the agent picture)
        agent_element = card.find('img', class_='w-[3rem] relative rounded h-[3rem] object-cover object-top')
        if agent_element:
            listing['agent'] = agent_element.get('alt')

        # Image URL (read from data-src, not src)
        img_element = card.find('img', class_='opacity-100 absolute block top-1/2 left-1/2 w-full h-full -translate-y-1/2 -translate-x-1/2 transition-opacity duration-200 object-center object-cover')
        if img_element:
            listing['image_url'] = img_element.get('data-src')

        # Virtual Tour: look at every badge with this class, not only the first
        # one, so the "Visita Virtual" badge is found wherever it sits in the card.
        badge_elements = card.find_all('b', class_='relative leading-[160%]')
        has_virtual_tour = any(badge.text.strip() == 'Visita Virtual' for badge in badge_elements)
        listing['virtual_tour'] = 'Yes' if has_virtual_tour else 'No'

        # Link: an anchor without href would make the string concatenation fail.
        link_element = card.find('a', {'data-id': 'listing-card-link'})
        href = link_element.get('href') if link_element else None
        if href:
            listing['link'] = "https://www.idealista.pt" + href

        listings.append(listing)

    return listings

# Example usage (if you have the HTML content in a file):
# with open("your_html_file.html", "r", encoding="utf-8") as f:
#     html_content = f.read()



In [4]:
# Example usage (if you want to fetch the HTML content from a URL):
# NOTE(review): this URL points at remax.pt, while extract_listing_info builds
# links with an idealista.pt prefix - confirm which site the selectors target.
url = 'https://www.remax.pt/pt/arrendar/imoveis/habitacao/lisboa/lisboa/r/t,preco__1200?s=%7B%22rg%22%3A%22Lisboa%22%7D&p=1&o=-PublishDate'
# Without a timeout, requests can wait indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.
response.raise_for_status()
response.encoding = 'utf-8' # Ensure correct encoding
html_content = response.text

listings_data = extract_listing_info(html_content)

# Print the extracted data (or process it further as needed)
for listing in listings_data:
    print(listing)

[]


In [None]:
# Read the houses from the CSV file


import pandas as pd



houses_raw = pd.read_csv('data/houses.csv')

# Filter out houses with no image URLs.
# Iterating a DataFrame directly yields its column names, so the rows are first
# converted to a list of dicts. Empty CSV cells are read as NaN, which is truthy,
# hence the explicit pd.notna check next to the empty-string check.
houses = [
    house
    for house in houses_raw.to_dict('records')
    if pd.notna(house['Image URLs']) and str(house['Image URLs']).strip()
]

# Print the length of the 'Image URLs' value for each house.
# NOTE(review): if the column holds several URLs serialized in one string, len()
# counts characters, not URLs - confirm the format and split it before counting.
for house in houses:
    print(len(house['Image URLs']))



In [3]:
import json
import logging
import os
import random
import time
from datetime import datetime

import backoff
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire import webdriver

# Configure logging: INFO and above is written both to scraper.log and to the
# default stream of StreamHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('scraper.log'),
        logging.StreamHandler()
    ]
)

# Proxy configuration
# Credentials must never live in the notebook: they are read from the
# OXYLABS_USERNAME / OXYLABS_PASSWORD environment variables instead. Any
# credentials that were previously committed here should be treated as
# compromised and rotated.
username = os.environ.get('OXYLABS_USERNAME', '')
password = os.environ.get('OXYLABS_PASSWORD', '')
if not (username and password):
    logging.warning(
        "Proxy credentials are not set; export OXYLABS_USERNAME and "
        "OXYLABS_PASSWORD before starting the scraper."
    )
proxy_url = f'http://{username}:{password}@pt-pr.oxylabs.io:10000'

# Scraping configurations
MAX_RETRIES = 3   # attempts per request
TIMEOUT = 30      # seconds, used for page loads and explicit waits
MIN_DELAY = 3     # lower bound (seconds) of the random pause between requests
MAX_DELAY = 8     # upper bound (seconds) of the random pause between requests
MAX_PAGES = 21    # upper limit on the result-page number

# List of user agents to rotate
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0'
]

# Headers template: (name, value) pairs resembling a browser page navigation
HEADERS_TEMPLATE = [
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'),
    ('Accept-Language', 'pt-PT,pt;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'),
    ('Accept-Encoding', 'gzip, deflate, br'),
    ('Connection', 'keep-alive'),
    ('Upgrade-Insecure-Requests', '1'),
    ('Sec-Fetch-Dest', 'document'),
    ('Sec-Fetch-Mode', 'navigate'),
    ('Sec-Fetch-Site', 'none'),
    ('Sec-Fetch-User', '?1')
]

class PropertyScraper:
    """Scrape idealista.pt rental listings for Lisbon with a proxied Chrome driver.

    Results accumulate in ``self.listings_data`` (a list of dicts) and
    ``self.current_page`` tracks the result page being processed. Progress can
    be written to ``scraping_progress.json`` and final results to a timestamped
    ``property_listings_*.json`` file.
    """

    def __init__(self):
        """Start a selenium-wire Chrome driver routed through ``proxy_url``.

        Raises:
            Exception: anything raised while creating the driver is logged and
                re-raised unchanged.
        """
        self.listings_data = []
        self.current_page = 1

        # Setup Chrome options for Selenium with proxy
        chrome_options = Options()
        #chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--disable-extensions')

        # Configure proxy with authentication
        proxy_options = {
            'proxy': {
                'http': proxy_url,
                'https': proxy_url
            }
        }

        # Add the same user agent rotation
        user_agent = random.choice(USER_AGENTS)
        chrome_options.add_argument(f'--user-agent={user_agent}')

        # Add additional headers
        # NOTE(review): Chrome has no documented `--header` switch, so these
        # arguments are presumably ignored - confirm, and consider selenium-wire's
        # request interceptor if the headers really have to be sent.
        for header, value in HEADERS_TEMPLATE:
            chrome_options.add_argument(f'--header={header}: {value}')

        try:
            self.driver = webdriver.Chrome(options=chrome_options, seleniumwire_options=proxy_options)
            self.driver.set_page_load_timeout(TIMEOUT)
            self.driver.implicitly_wait(10)  # Add implicit wait
        except Exception as e:
            logging.error(f"Failed to initialize WebDriver: {str(e)}")
            raise

    def __del__(self):
        """Best-effort shutdown of the browser when the scraper is collected."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        try:
            driver.quit()
        except Exception:
            # scrape_pages() already quits the driver in its `finally` block, and
            # finalizers may run during interpreter shutdown; a failure here
            # cannot be acted on, so it is deliberately ignored.
            pass

    def load_progress(self):
        """Load progress from file if exists.

        Restores ``current_page`` and ``listings_data`` from
        ``scraping_progress.json``. A missing or unreadable file leaves the
        scraper in its fresh state.
        """
        try:
            with open('scraping_progress.json', 'r') as f:
                progress = json.load(f)
        except FileNotFoundError:
            logging.info("No previous progress found. Starting fresh.")
            return
        except json.JSONDecodeError as e:
            # e.g. the file was truncated because a previous run died mid-write
            logging.warning(f"Progress file is not valid JSON ({str(e)}). Starting fresh.")
            return

        self.current_page = progress.get('last_page', 1)
        self.listings_data = progress.get('listings', [])
        logging.info(f"Resumed from page {self.current_page}")

    def save_progress(self):
        """Save current progress (page, listings, timestamp) to scraping_progress.json."""
        progress = {
            'last_page': self.current_page,
            'listings': self.listings_data,
            'timestamp': datetime.now().isoformat()
        }
        with open('scraping_progress.json', 'w') as f:
            json.dump(progress, f)
        logging.info(f"Progress saved at page {self.current_page}")

    def save_results(self):
        """Save final results to a timestamped, UTF-8 encoded JSON file."""
        filename = f'property_listings_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.listings_data, f, ensure_ascii=False, indent=2)
        logging.info(f"Results saved to {filename}")

    @backoff.on_exception(
        backoff.expo,
        (TimeoutException, WebDriverException),
        max_tries=MAX_RETRIES,
        max_time=60
    )
    def make_request(self, url):
        """Make request using Selenium with retry logic.

        Picks a random user agent, clears cookies, loads ``url``, scrolls to the
        bottom and waits for an ``article`` element.

        Args:
            url: Address of the page to load.

        Returns:
            The page source after loading. If the wait for ``article`` times out
            and no challenge form is found, the page source is still returned.

        Raises:
            TimeoutException: when a challenge form is detected after the wait
                timed out.
            Exception: any other error is logged and re-raised, which lets the
                ``backoff`` decorator retry the Selenium ones.
        """
        try:
            # Update User-Agent randomly before each request
            user_agent = random.choice(USER_AGENTS)
            logging.info(f"Selected User-Agent: {user_agent}")
            self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {
                "userAgent": user_agent
            })

            # Clear cookies before each request
            self.driver.delete_all_cookies()
            logging.info("Cleared all cookies.")

            logging.info(f"Navigating to URL: {url}")
            self.driver.get(url)
            logging.info(f"Navigation complete for URL: {url}")
            logging.info("Scrolling to bottom to trigger lazy loading...")
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # wait a bit for lazy loading
            try:
                WebDriverWait(self.driver, TIMEOUT).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'article'))
                )
            except TimeoutException:
                logging.warning("Timeout waiting for items, checking for blocking...")
                if 'challenge-form' in self.driver.page_source:
                    logging.warning("Detected challenge form, waiting...")
                    time.sleep(30)
                    raise TimeoutException("Challenge form detected")

            return self.driver.page_source

        except Exception as e:
            logging.error(f"Request error for URL {url}: {str(e)}")
            raise

    def _attach_images(self, listing_data):
        """Fetch the gallery of ``listing_data['url']`` into ``listing_data['images']``.

        This navigates the shared driver to the detail page, so it must only be
        called when no WebElement of the previous page is needed any more.
        """
        if listing_data['url']:
            listing_data['images'] = self.get_listing_images(listing_data['url'])
            logging.info(f"Found {len(listing_data['images'])} images for listing: {listing_data['title']}")

    def parse_listing(self, listing_element, fetch_images=True):
        """Parse individual listing data using Selenium elements.

        Args:
            listing_element: WebElement of one listing on a results page.
            fetch_images: When True (default) the listing's detail page is opened
                to collect its images. That navigation invalidates every other
                WebElement of the results page, so callers iterating over several
                elements should pass False and fetch the images afterwards.

        Returns:
            A dict with ``title``, ``price``, ``location``, ``url``, ``images``
            and ``scraped_at``, or None when the element cannot be parsed.
        """
        logging.info("Starting to parse a listing element.")

        try:
            title_element = listing_element.find_element(By.CSS_SELECTOR, 'a.item-link')
            price_element = listing_element.find_element(By.CSS_SELECTOR, '.item-price')
            logging.info("Found title and price elements for a listing.")

            try:
                location_element = listing_element.find_element(By.CSS_SELECTOR, '.item-detail-location')
                location = location_element.text.strip().replace('\n', ', ')
                logging.info("Location element found for listing.")
            except WebDriverException:
                # Narrower than a bare `except:` so KeyboardInterrupt still stops
                # the run; Selenium's lookup errors derive from WebDriverException.
                location = None
                logging.info("Location element not found; set location to None.")

            listing_url = title_element.get_attribute('href')
            logging.info(f"Extracted listing URL: {listing_url}")

            listing_data = {
                'title': title_element.text.strip(),
                'price': price_element.text.strip(),
                'location': location,
                'url': listing_url,
                'images': [],
                'scraped_at': datetime.now().isoformat()
            }
            logging.info(f"Basic listing info - Title: {listing_data['title']}, Price: {listing_data['price']}, Location: {listing_data['location']}")

            # Get images if requested and a URL exists
            if fetch_images:
                self._attach_images(listing_data)

            logging.info(f"Returning parsed data for listing: {listing_data['title']}")
            return listing_data

        except Exception as e:
            logging.error(f"Error parsing listing: {str(e)}")
            return None

    def get_listing_images(self, url):
        """Get all images from a listing page.

        Args:
            url: Address of the listing's detail page.

        Returns:
            A list of image URLs with 'thumb' replaced by 'big'; an empty list
            when the page or its gallery cannot be loaded.
        """
        try:
            self.driver.get(url)

            # Wait for gallery to load
            WebDriverWait(self.driver, TIMEOUT).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'gallery-thumbs'))
            )

            # Get all image elements
            image_elements = self.driver.find_elements(By.CSS_SELECTOR, '.gallery-thumbs img')
            logging.info(f"Found {len(image_elements)} image elements on listing page: {url}")

            # Extract image URLs and convert thumbnail URLs to full-size ones
            image_urls = []
            for img in image_elements:
                src = img.get_attribute('src')
                if src:
                    image_urls.append(src.replace('thumb', 'big'))

            # Add random delay between requests
            delay = random.uniform(MIN_DELAY, MAX_DELAY)
            time.sleep(delay)

            return image_urls

        except Exception as e:
            logging.error(f"Error getting images for URL {url}: {str(e)}")
            return []

    @staticmethod
    def _page_url(page_number):
        """Return the results URL for ``page_number`` (page 1 has no suffix)."""
        if page_number == 1:
            return 'https://www.idealista.pt/arrendar-casas/lisboa/'
        return f'https://www.idealista.pt/arrendar-casas/lisboa/pagina-{page_number}'

    def scrape_pages(self):
        """Main scraping function using Selenium.

        Walks the result pages from ``self.current_page`` up to ``MAX_PAGES - 1``,
        retrying each page up to three times, saves progress every fifth page and
        writes the results at the end. On interruption or a fatal error both
        progress and results are saved. The driver is always closed on exit.
        """
        try:
            for page_number in range(self.current_page, MAX_PAGES):
                self.current_page = page_number
                max_retries = 3
                retry_count = 0

                while retry_count < max_retries:
                    logging.info(f"Attempt {retry_count + 1} for page {page_number}")
                    try:
                        # Build URL with pagination
                        url = self._page_url(page_number)

                        logging.info(f"Processing Page {page_number}")

                        page_source = self.make_request(url)

                        # Check for blocking
                        if 'challenge-form' in page_source or 'Access denied' in page_source:
                            logging.warning("Possible blocking detected. Waiting longer...")
                            time.sleep(60)
                            retry_count += 1
                            continue

                        # Find all listing elements
                        listings = self.driver.find_elements(By.CSS_SELECTOR, 'article')

                        if not listings:
                            logging.warning(f"No listings found on page {page_number}")
                            retry_count += 1
                            # Pause before reloading so an empty page is not
                            # requested again back-to-back.
                            time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))
                            continue

                        # Phase 1: read every card while the results page is still
                        # loaded. Opening a detail page makes all of these
                        # WebElements stale, so no navigation may happen here.
                        parsed_listings = []
                        for listing in listings:
                            listing_data = self.parse_listing(listing, fetch_images=False)
                            if listing_data:
                                parsed_listings.append(listing_data)

                        # Phase 2: only plain dicts are needed from here on, so it
                        # is safe to visit each detail page for its images.
                        for listing_data in parsed_listings:
                            self._attach_images(listing_data)
                            self.listings_data.append(listing_data)

                        logging.info(f"Completed processing {len(listings)} listings on page {page_number}")

                        # Save progress periodically
                        if page_number % 5 == 0:
                            self.save_progress()

                        # Random delay before next request
                        delay = random.uniform(MIN_DELAY, MAX_DELAY)
                        logging.info(f"Waiting {delay:.2f} seconds before next request...")
                        time.sleep(delay)

                        # If we got here, break the retry loop
                        break

                    except Exception as error:
                        logging.error(f"Error on page {page_number}: {str(error)}")
                        retry_count += 1
                        if retry_count >= max_retries:
                            logging.error(f"Max retries reached for page {page_number}")
                            break  # give up on this page and move to the next one
                        time.sleep(random.uniform(10, 20))

            # Save final results
            self.save_results()
            logging.info("Scraping completed successfully!")

        except KeyboardInterrupt:
            logging.info("Scraping interrupted by user")
            self.save_progress()
            self.save_results()
        except Exception as e:
            logging.error(f"Fatal error: {str(e)}")
            self.save_progress()
            self.save_results()
        finally:
            if hasattr(self, 'driver'):
                self.driver.quit()

# Entry point: constructing PropertyScraper starts the Chrome driver, and
# scrape_pages() then crawls the result pages.
if __name__ == "__main__":
    scraper = PropertyScraper()
    scraper.scrape_pages()


2025-01-31 23:04:22,881 - INFO - Using default request storage
2025-01-31 23:04:22,889 - INFO - Created proxy listening on ::ffff:127.0.0.1:54843
2025-01-31 23:04:24,926 - INFO - Attempt 1 for page 1
2025-01-31 23:04:24,929 - INFO - Processing Page 1
2025-01-31 23:04:24,930 - INFO - Selected User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0
2025-01-31 23:04:24,944 - INFO - Cleared all cookies.
2025-01-31 23:04:24,948 - INFO - Navigating to URL: https://www.idealista.pt/arrendar-casas/lisboa/
2025-01-31 23:04:25,391 - INFO - Capturing request: https://accounts.google.com/ListAccounts?gpsia=1&source=ChromiumBrowser&json=standard
2025-01-31 23:04:25,495 - INFO - Capturing request: https://www.idealista.pt/arrendar-casas/lisboa/
2025-01-31 23:04:25,548 - INFO - Capturing response: https://accounts.google.com/ListAccounts?gpsia=1&source=ChromiumBrowser&json=standard 200 
2025-01-31 23:04:25,712 - INFO - Capturing response: https://www.idealista.pt/

In [5]:
import requests
from bs4 import BeautifulSoup

class MeoScraper:
    """Scraper for the "Sensações MEOS" catalogue page of the MEO online store."""

    @staticmethod
    def scrape_sensacoes_meos(timeout=30):
        """Download the catalogue page and return the products it lists.

        Items without a name or without a purely numeric points value are
        skipped.

        Args:
            timeout: Seconds to wait for the server before giving up. Without a
                timeout, requests can block indefinitely.

        Returns:
            A list of dicts with the keys ``name``, ``points`` (int),
            ``image_url``, ``link_url``, ``available`` (bool), ``url``,
            ``description`` and ``stock`` (1 when available, otherwise 0).

        Raises:
            requests.exceptions.RequestException: on connection errors, timeouts
                or an HTTP error status.
        """
        url = "https://loja.meo.pt/sensacoes-meos"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        # Do not parse an error page as if it were an empty catalogue.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        products = []

        for item in soup.select('li.sf-item'):
            # Product name: prefer the title attribute, fall back to the heading
            # text, and skip the item when neither yields a name.
            name_elem = item.select_one('h3.font-semibold')
            if not name_elem:
                continue
            name = name_elem.get('title', '').strip() or name_elem.get_text(strip=True)
            if not name:
                continue

            # Points value: isdecimal() only accepts characters int() can parse
            # (isdigit() also accepts e.g. superscript digits, which int() rejects).
            points_elem = item.select_one('.sf-item-footer-item .sf-item-details span:nth-of-type(2)')
            points_text = points_elem.text.strip() if points_elem else ''
            if not points_text.isdecimal():
                continue  # Skip if no points found

            # Image URL (empty string when the item has no image)
            img_elem = item.select_one('.sf-item-image img')
            image_url = img_elem.get('src', '') if img_elem else ''

            # Link URL and availability: a 'no-stock' class on the wrapper link
            # marks the item as unavailable; no link at all counts as unavailable.
            link_elem = item.select_one('a.sf-item-wrapper')
            if link_elem:
                link_url = link_elem.get('href', '')
                available = 'no-stock' not in link_elem.get('class', [])
            else:
                link_url = ''
                available = False

            products.append({
                'name': name,
                'points': int(points_text),
                'image_url': image_url,
                'link_url': link_url,
                'available': available,
                # Additional information
                'url': url,
                'description': name,
                'stock': 1 if available else 0,
            })

        return products

    @staticmethod
    def get_test_product():
        """Return a fixed, fake product with the same keys as a scraped one."""
        return {
            'name': 'Test Product SMS',
            'points': 1000,
            'url': 'https://example.com/test-product',
            'available': True,
            'image_url': 'https://example.com/test-product-image.jpg',
            'link_url': 'https://example.com/test-product',
            'description': 'This is a test product to trigger SMS notification',
            'stock': 5
        }

In [6]:
# Calls the scraper (which issues an HTTP request to the MEO store page) and
# prints the whole returned list of product dicts.
print(MeoScraper.scrape_sensacoes_meos())

[{'name': 'Convite 2 Pessoas Antestreia Lisboa Mickey 17', 'points': 450, 'image_url': 'https://conteudos.meo.pt/catalogo/isell/sensacoes/cinema-uci/cinema-uci-mickey-17-2025-meo.webp', 'link_url': 'https://loja.meo.pt/produto/meo/convite-2-pessoas-antestreia-lisboa-mickey-17', 'available': True, 'url': 'https://loja.meo.pt/sensacoes-meos', 'description': 'Convite 2 Pessoas Antestreia Lisboa Mickey 17', 'stock': 1}, {'name': 'Convite 2 Pessoas Antestreia Porto Mickey 17', 'points': 450, 'image_url': 'https://conteudos.meo.pt/catalogo/isell/sensacoes/cinema-uci/cinema-uci-mickey-17-2025-meo.webp', 'link_url': 'https://loja.meo.pt/produto/meo/convite-2-pessoas-antestreia-porto-mickey-17', 'available': True, 'url': 'https://loja.meo.pt/sensacoes-meos', 'description': 'Convite 2 Pessoas Antestreia Porto Mickey 17', 'stock': 1}, {'name': 'Convite Duplo Camarote MEOS - Ricky Gervais Mortality - 27 fev - 19h30', 'points': 1500, 'image_url': 'https://conteudos.meo.pt/catalogo/isell/sensacoes/r