In [1]:
# Import the Selenium packages needed below; this notebook drives Chrome through chromedriver
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service as ChromeService 
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By 
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

# BeautifulSoup for parsing HTML
from bs4 import BeautifulSoup as soup

import requests
import time
import pymongo

In [4]:
# Search is centred on Djelfa (Marketplace location id 112237105459123 in the URL below).
# The author picked it because it is close to the centre of Algeria, which makes it a
# good anchor for scraping with a large radius.
base_url = "https://web.facebook.com/marketplace/112237105459123/propertyforsale/?"

# Result ordering and search radius sent in the query string.
sorting = "price_descend"
radius = 25

# Lower price bound. Further criteria (maxPrice, longitude, latitude, ...) can be
# added to the query string in the same way if needed.
minPrice = 2500

# Final search URL; edit the values above to change the query.
url = base_url + "&".join((
    f"minPrice={minPrice}",
    f"sortBy={sorting}",
    f"radius={radius}",
))

In [12]:
# Working with selenium headless browser
options = webdriver.ChromeOptions()
# Ask for headless mode with a Chrome command-line switch. The `options.headless`
# property is deprecated and has been removed from recent Selenium 4 releases, where
# assigning to it only sets an unused attribute and Chrome opens a visible window.
options.add_argument("--headless=new")

# webdriver_manager downloads (or reuses) a chromedriver binary and returns its path.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
driver.get(url)

# Function that stores the announcements in the data base
def save_to_database(records):
    """Insert scraped listings into MongoDB, skipping links that are already stored.

    Connects to ``mongodb://localhost:27017`` and writes to the ``fmp`` collection
    of the ``Real-Estate`` database. A record is treated as a duplicate when its
    ``'Link'`` value equals the ``'Link'`` of a stored document or of an earlier
    record in the same batch. Records without a ``'Link'`` cannot be de-duplicated
    and are always inserted.

    Args:
        records: list of dicts, one per listing. Inserted dicts are mutated by
            PyMongo, which adds an ``'_id'`` key to each of them.

    Returns:
        None.
    """
    if not records:
        # Nothing to store; also avoids insert_many rejecting an empty batch.
        return

    client = pymongo.MongoClient('mongodb://localhost:27017')
    try:
        information = client["Real-Estate"].fmp

        # Fetch only the Link field: that is all de-duplication needs, and a set
        # gives constant-time membership checks.
        known_links = {
            stored["Link"]
            for stored in information.find({}, {"Link": 1, "_id": 0})
            if "Link" in stored
        }

        new_records = []
        for record in records:
            link = record.get("Link")
            if link is not None:
                if link in known_links:
                    continue
                # Remember the link so a repeat later in this batch is skipped too.
                known_links.add(link)
            new_records.append(record)

        if new_records:
            information.insert_many(new_records)
    finally:
        # Always release the connection, even when a query or insert fails.
        client.close()

# A function that closes the Facebook popup login windows
def close_login_page():
    """Dismiss the Facebook login popup when its close button is on the page.

    Works on the module-level ``driver``. If the close button cannot be found,
    the function returns immediately without waiting.
    """
    close_button_xpath = "//div[@class='x92rtbv x10l6tqk x1tk7jg1 x1vjfegm']"
    try:
        driver.find_element(By.XPATH, close_button_xpath).click()
    except NoSuchElementException:
        # No popup on this page: nothing to close.
        return

    # Wait for the popup windows to close
    time.sleep(2)

def next_image():
    """Click the 'View next image' control and return the first image's source.

    Works on the module-level ``driver``: clicks the element whose aria-label is
    'View next image', pauses two seconds, then reads the ``src`` attribute of the
    first element matched by ``//img``. No comparison with earlier images is done
    here; that is left to the caller.

    Returns:
        The ``src`` attribute value, or None when either element lookup raises
        NoSuchElementException.
    """
    try:
        driver.find_element(By.XPATH, "//div[@aria-label='View next image']").click()

        # Wait for the image to be shown
        time.sleep(2)

        return driver.find_element(By.XPATH, "//img").get_attribute("src")
    except NoSuchElementException:
        return None

# A function to scroll to the bottom of the page
def scroll_to_bottom(driver):
    """Keep scrolling to the end of the page until its height stops changing.

    The login popup is dismissed first (via ``close_login_page``). After every
    scroll the function sleeps six seconds and then compares
    ``document.body.scrollHeight`` with the value from the previous round.

    Args:
        driver: object exposing ``execute_script``, used to scroll and to read the height.
    """
    close_login_page()

    height_query = "return document.body.scrollHeight"
    previous_height = driver.execute_script(height_query)
    reached_end = False
    while not reached_end:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(6)
        current_height = driver.execute_script(height_query)
        # An unchanged height after the pause means nothing new was loaded.
        reached_end = current_height == previous_height
        previous_height = current_height


# Scroll to the bottom of the page to load all content
scroll_to_bottom(driver)

# JavaScript to remove the Facebook banner from the DOM
script = """
var banner = document.querySelector('div.x78zum5.xdt5ytf.x2lah0s.x193iq5w.x2bj2ny.x1ey2m1c.xayqjjm.x9f619.xds687c.x1xy6bms.xn6708d.x1s14bel.x1ye3gou.xixxii4.x17qophe.x1u8a7rm');
if (banner) {
    banner.parentNode.removeChild(banner);
}
"""

# Execute the JavaScript with Selenium
driver.execute_script(script)


# Waiting for all the elements to be visible
# The timeout used to be written `5|0`, a bitwise OR that evaluates to 5; it is now
# spelled as the plain value so the effective timeout (5 seconds) is unchanged.
# NOTE(review): if 50 seconds was intended, change the literal here.
wait = WebDriverWait(driver, 5)
parent_elements = wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//a[@class='x1i10hfl xjbqb8w x1ejq31n xd10rxx x1sy0etr x17r0tee x972fbf xcfux6l x1qhh985 xm0m39n x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g x1sur9pj xkrqix3 x1lku1pv']")))

# Scraping data from each element and putting them in a list
listings_data = []

def remove_additional_information(text):
    """Return ``text`` with Facebook's filler fragments stripped out.

    Every occurrence of "[hidden information]" is removed first, then every
    occurrence of "See less". Nothing else in the string is touched.

    Args:
        text: string to clean.

    Returns:
        The cleaned string.
    """
    without_hidden_marker = text.replace("[hidden information]", '')
    return without_hidden_marker.replace("See less", '')


# Collecting posts links firstly
# Anchors without an href are dropped and repeated links are visited only once
# (dict.fromkeys keeps the first occurrence and preserves order).
parent_links = list(dict.fromkeys(
    href
    for href in (parent_element.get_attribute("href") for parent_element in parent_elements)
    if href
))
print(len(parent_links))


def _scrape_current_listing(listing_link):
    """Build one listing record from the page currently loaded in ``driver``.

    Uses the module-level ``driver``; the caller must already have navigated to
    ``listing_link``. The description is expanded through its 'See more' control
    when one is present, and the image viewer is advanced until it yields no
    source or repeats one that was already collected.

    Args:
        listing_link: URL of the listing, stored under the record's "Link" key.

    Returns:
        dict with the keys Title, Price, Location, Description, Images, Source,
        Date, Link, Category and Surface.

    Raises:
        Whatever Selenium raises when a required element is missing or cannot be
        interacted with (for example NoSuchElementException).
    """
    driver.execute_script("window.scrollBy(0, 300);")
    time.sleep(1)

    listing_description_element = driver.find_element(By.XPATH, ".//div[@class='xz9dl7a x4uap5 xsag5q8 xkhd6sd x126k92a']")

    # The 'See more' control is optional; expand the description only when it exists.
    try:
        see_more_button = listing_description_element.find_element(By.XPATH, ".//span[contains(text(), 'See more')]")
    except NoSuchElementException:
        see_more_button = None

    if see_more_button:
        see_more_button.click()
        time.sleep(3)

    listing_description = listing_description_element.text.strip()
    if not listing_description:
        listing_description = "Description not available"
    description = remove_additional_information(listing_description.replace('\n', ''))

    listing_price = driver.find_element(By.XPATH, ".//span[@class='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x676frb x1lkfr7t x1lbecb7 x1s688f xzsf02u']").text
    listing_title = driver.find_element(By.XPATH, ".//span[@class='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x14z4hjw x3x7a5m xngnso2 x1qb5hxa x1xlr1w8 xzsf02u']").text
    # Clean the title with the shared helper. The earlier inline loop read a
    # `toBeRemoved` name that is not defined at module level.
    listing_title = remove_additional_information(listing_title)

    listing_location = driver.find_element(By.XPATH, ".//span[@class='x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u x1yc453h']").text

    images_data = []
    listing_image_elements = driver.find_elements(By.XPATH, ".//img")
    if listing_image_elements:
        images_data.append(listing_image_elements[0].get_attribute("src"))
        while True:
            new_image_source = next_image()
            # Stop when there is no further image or the viewer shows one we
            # already have; this also prevents an endless loop.
            if not new_image_source or new_image_source in images_data:
                break
            images_data.append(new_image_source)

    return {
        "Title": listing_title,
        "Price": listing_price,
        "Location": listing_location,
        "Description": description,
        "Images": images_data,
        "Source": "Facebook MarketPlace",
        "Date": None,
        "Link": listing_link,
        "Category": None,
        "Surface": None,
    }


# Iterate over links to visit each page separately
try:
    for listing_link in parent_links:
        driver.get(listing_link)
        time.sleep(3)

        close_login_page()

        try:
            temporary_listings_data = _scrape_current_listing(listing_link)
        except Exception as e:
            # Best effort: skip a listing that cannot be parsed rather than
            # storing fields left over from the previous iteration.
            print("Error:", e, "- skipped", listing_link)
            continue

        listings_data.append(temporary_listings_data)
        print(temporary_listings_data)

finally:
    try:
        # Persist whatever was collected, even if the loop was interrupted.
        save_to_database(listings_data)
        print("Scraped items: ", len(listings_data))
    finally:
        # Always shut the browser down so no chromedriver process is left behind.
        driver.quit()



21
{'Title': 'بيت للبيع', 'Price': 'DZD774,328,122', 'Location': 'الجلفة', 'Description': '4.30/12', 'Images': ['https://scontent.fczl2-1.fna.fbcdn.net/v/t45.5328-4/425644281_1588848748514756_126158019768302168_n.jpg?stp=dst-jpg_s960x960&_nc_cat=111&ccb=1-7&_nc_sid=247b10&_nc_ohc=PgVn3fXKApYQ7kNvgHWyB8c&_nc_ht=scontent.fczl2-1.fna&oh=00_AYC0RiDt1XipouwoCNZ8QrwbkuurqOmmsURYICJQgRPd4g&oe=664D5E4D'], 'Source': 'Facebook MarketPlace', 'Date': None, 'Link': 'https://www.facebook.com/marketplace/item/725212512898220/?ref=category_feed&referral_code=undefined&referral_story_type=listing&tracking=%7B%22qid%22%3A%22-1983630366595188999%22%2C%22mf_story_key%22%3A%227926576350687309%22%2C%22commerce_rank_obj%22%3A%22%7B%5C%22target_id%5C%22%3A7926576350687309%2C%5C%22target_type%5C%22%3A0%2C%5C%22primary_position%5C%22%3A0%2C%5C%22ranking_signature%5C%22%3A4891129470393652547%2C%5C%22commerce_channel%5C%22%3A504%2C%5C%22value%5C%22%3A4.7555141577301e-6%2C%5C%22candidate_retrieval_source_map%5C%22

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=124.0.6367.208)
Stacktrace:
	GetHandleVerifier [0x00AAC113+48259]
	(No symbol) [0x00A3CA41]
	(No symbol) [0x00930A17]
	(No symbol) [0x0090E02B]
	(No symbol) [0x0099742E]
	(No symbol) [0x009A9476]
	(No symbol) [0x00990B36]
	(No symbol) [0x0096570D]
	(No symbol) [0x009662CD]
	GetHandleVerifier [0x00D665A3+2908435]
	GetHandleVerifier [0x00DA3BBB+3159851]
	GetHandleVerifier [0x00B450CB+674875]
	GetHandleVerifier [0x00B4B28C+699900]
	(No symbol) [0x00A46244]
	(No symbol) [0x00A42298]
	(No symbol) [0x00A4242C]
	(No symbol) [0x00A34BB0]
	BaseThreadInitThunk [0x76897BA9+25]
	RtlInitializeExceptionChain [0x77E4BE3B+107]
	RtlClearBits [0x77E4BDBF+191]
