In [3]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time
import random
import os
import sys

# File paths
CSV_FILE = "data.csv"
LAST_PAGE_FILE = "last_page.txt"


def load_start_page(path):
    """Return the results page to resume from.

    Args:
        path: Text file holding the number of the last page that was saved.

    Returns:
        The integer stored in *path*, or 1 when the file is missing or does
        not contain a valid integer (for example, it was left empty by an
        interrupted write).
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return int(f.read())
    except FileNotFoundError:
        return 1  # If no file exists, start from page 1
    except ValueError:
        # An unreadable checkpoint should not abort the run; restarting from
        # page 1 is safe because already-scraped links are skipped anyway.
        print(f"⚠ Could not read a page number from {path}; starting from page 1.")
        return 1


def load_existing_links(path):
    """Return the set of ad links already stored in the CSV at *path*.

    Args:
        path: CSV file written by earlier runs; it must have a "Link" column.

    Returns:
        A set of link strings; empty when the file is missing or has no
        content at all.

    Raises:
        KeyError: If the CSV has content but no "Link" column.
    """
    try:
        df_existing = pd.read_csv(path)
    except FileNotFoundError:
        return set()
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; treat it as "nothing scraped yet".
        return set()
    # Drop missing values so NaN never ends up in the duplicate-check set.
    return set(df_existing["Link"].dropna())


# Resume scraping from the last saved page (to avoid starting over)
start_page = load_start_page(LAST_PAGE_FILE)

# Load existing links to skip duplicates
existing_links = load_existing_links(CSV_FILE)

# Build the Chrome options: headless mode plus a fixed desktop user-agent string
options = uc.ChromeOptions()
options.headless = True
options.add_argument(
    "user-agent="
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/110.0.0.0 Safari/537.36"
)

# Launch the browser session that the rest of the script drives
driver = uc.Chrome(options=options)

# Function to scroll down the page
def scroll_down(driver, pause_range=(2, 4), max_scrolls=None):
    """Scroll to the bottom of the page until its height stops changing.

    After each scroll the function sleeps for a random interval and then
    re-reads ``document.body.scrollHeight``; it stops as soon as two
    consecutive readings are equal.

    Args:
        driver: Object exposing ``execute_script(script)`` (a WebDriver).
        pause_range: ``(low, high)`` bounds, in seconds, for the random
            pause after each scroll. Defaults to the original 2-4 seconds.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps the original unbounded behaviour; pass a
            number to guarantee termination on pages that keep growing.
    """
    low, high = pause_range
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls += 1
        time.sleep(random.uniform(low, high))  # Random sleep
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

#  Function to safely find an element and return its text
def safe_find_element(parent, by, value):
    """Return the text of the first element matching the locator under *parent*.

    Args:
        parent: Object exposing ``find_element(by, value)``.
        by: Locator strategy passed through to ``find_element``.
        value: Locator expression passed through to ``find_element``.

    Returns:
        The matched element's ``text``, or the placeholder "N/A" when the
        lookup raises ``NoSuchElementException``.
    """
    try:
        element = parent.find_element(by, value)
        text = element.text
    except NoSuchElementException:
        text = "N/A"
    return text

# Locators and limits used by the scraping loop
AD_CARD_XPATH = '//a[contains(@class, "rounded-5 wrap")]'
DETAILS_TABLE_XPATH = "/html/body/div/div/div/div[1]/div/div[2]/div[2]/div/div[1]/div[2]/div/div[3]/div/table"
EXTRA_DETAILS_XPATH = "/html/body/div/div/div/div[1]/div/div[2]/div[2]/div/div[1]/div[7]/div"
DESCRIPTION_XPATH = "/html/body/div/div/div/div[1]/div/div[2]/div[2]/div/div[1]/div[1]/h1"
FULL_PRICE_XPATH = "/html/body/div/div/div/div[1]/div/div[2]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div/span"
SEARCH_URL = "https://olx.ba/pretraga?attr=&attr_encoded=1&category_id=2668&page={page}"
LAST_PAGE = 41  # Last results page to visit (inclusive)
MAX_DETAIL_ATTEMPTS = 3  # How many times an ad page is loaded before giving up

# List to store every row scraped during this run (also written to CSV_FILE)
data = []

# Highest page whose number was written to LAST_PAGE_FILE in this run
last_page_scraped = None


def collect_new_ads(cards, seen_links):
    """Read link, title and price from each ad card on the results page.

    Everything needed from the results page is read up front, so the caller
    can navigate to the ad pages afterwards without the card elements going
    stale and without having to return to the results page.

    Args:
        cards: Ad anchor elements found on the current results page.
        seen_links: Links that were already scraped; matching cards are skipped.

    Returns:
        A list of dicts with "Title", "Price" and "Link" keys, one per new ad.
    """
    pending = []
    queued_links = set()  # guards against the same ad appearing twice on one page
    for card in cards:
        try:
            link = card.get_attribute("href")

            # Check if the ad was already scraped
            if link in seen_links or link in queued_links:
                print(f"Skipping already scraped ad: {link}")
                continue

            title = safe_find_element(card, By.XPATH, './/h1')
            price = safe_find_element(card, By.XPATH, './/span[contains(@class, "smaller")]')
            print(f" Found ad: {title}, Price: {price}, Link: {link}")

            queued_links.add(link)
            pending.append({"Title": title, "Price": price, "Link": link})
        except Exception as e:
            # Best effort: one unreadable card must not abort the whole page.
            print(f"⚠ Error processing ad: {e}")
    return pending


def scrape_ad_details(driver, link):
    """Open an ad page and return its detail fields.

    The page is loaded up to MAX_DETAIL_ATTEMPTS times; if the details table
    never appears, every field falls back to "N/A".

    Args:
        driver: The WebDriver used for navigation.
        link: URL of the ad page.

    Returns:
        A dict with "Details", "Extra Details", "Description" and
        "Full Price" keys.
    """
    for attempt in range(MAX_DETAIL_ATTEMPTS):
        try:
            driver.get(link)

            # Wait for the details to load
            details = WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.XPATH, DETAILS_TABLE_XPATH))
            )
            details_text = details.text.replace("\n", " | ")

            # Find other details
            extra_details = safe_find_element(driver, By.XPATH, EXTRA_DETAILS_XPATH)
            description = safe_find_element(driver, By.XPATH, DESCRIPTION_XPATH)
            full_price = safe_find_element(driver, By.XPATH, FULL_PRICE_XPATH)
            break  # If successful, stop retrying
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error loading details ({e})")
    else:
        # Every attempt failed: record placeholders so the ad is still listed.
        details_text = extra_details = description = full_price = "N/A"

    return {
        "Details": details_text,
        "Extra Details": extra_details,
        "Description": description,
        "Full Price": full_price,
    }


try:
    for page in range(start_page, LAST_PAGE + 1):  # Continue until LAST_PAGE
        print(f"\n Processing page {page}...")
        url = SEARCH_URL.format(page=page)
        driver.get(url)

        # Wait for the ads to load
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, AD_CARD_XPATH))
            )
        except TimeoutException:
            print(f"⚠ Page {page} - No ads found!")
            continue

        # Scroll down to load all ads
        scroll_down(driver)

        cards = driver.find_elements(By.XPATH, AD_CARD_XPATH)

        # Check if there are any ads on the page
        if not cards:
            print(f"⚠ Page {page} - No ads found!")
            continue

        # Visit each new ad; the card data was captured before navigating away
        for ad in collect_new_ads(cards, existing_links):
            try:
                details = scrape_ad_details(driver, ad["Link"])

                # Key order defines the CSV column order and must stay stable,
                # because rows are appended to an existing file without a header.
                row = {
                    "Title": ad["Title"],
                    "Price": ad["Price"],
                    "Link": ad["Link"],
                    "Details": details["Details"],
                    "Extra Details": details["Extra Details"],
                    "Description": details["Description"],
                    "Full Price": details["Full Price"],
                }

                # Save the row immediately so progress survives a crash.
                # The header is needed when the file is missing or still empty.
                needs_header = (
                    not os.path.exists(CSV_FILE) or os.path.getsize(CSV_FILE) == 0
                )
                pd.DataFrame([row]).to_csv(
                    CSV_FILE, mode='a', header=needs_header, index=False, encoding="utf-8"
                )
                data.append(row)

                # Remember the link so the ad is not scraped again later in this run
                existing_links.add(row["Link"])

                # Random pause between ad pages
                time.sleep(random.randint(3, 7))

            except Exception as e:
                print(f"⚠ Error processing ad: {e}")

        # Save the last page number
        with open(LAST_PAGE_FILE, "w") as f:
            f.write(str(page))
        last_page_scraped = page

        # Random sleep before moving to the next page
        time.sleep(random.randint(5, 15))

finally:
    driver.quit()

print("\n Scraping complete!") 
print(f"Total ads found: {len(data)}")
print(f"Data saved to {CSV_FILE}")
# `page` is unbound when the page range is empty, so report the tracked value instead
print(f"Last page scraped: {last_page_scraped if last_page_scraped is not None else 'none'}")



 Processing page 41...
Skipping already scraped ad: https://olx.ba/artikal/45748154?recommendation_source=homepage
Skipping already scraped ad: https://olx.ba/artikal/43596870?recommendation_source=homepage
Skipping already scraped ad: https://olx.ba/artikal/45481774?recommendation_source=homepage
Skipping already scraped ad: https://olx.ba/artikal/65915607?recommendation_source=homepage
Skipping already scraped ad: https://olx.ba/artikal/57522857?recommendation_source=homepage
Skipping already scraped ad: https://olx.ba/artikal/58950400?recommendation_source=homepage
Skipping already scraped ad: https://olx.ba/artikal/65849892?recommendation_source=homepage
Skipping already scraped ad: https://olx.ba/artikal/39240777?recommendation_source=homepage
Skipping already scraped ad: https://olx.ba/artikal/60018590?recommendation_source=homepage
Skipping already scraped ad: https://olx.ba/artikal/51758674?recommendation_source=homepage
Skipping already scraped ad: https://olx.ba/artikal/6580