# Web Scraping Dubizzle_mobile_phones.com

In this notebook, I demonstrate **two different approaches** to writing Python web scraping code:

1. **Monolithic Approach (All-in-One Code)**  
   - In this approach, the entire scraping process is written in one continuous block of code.  
   - It includes fetching pages, parsing HTML, extracting product details, and writing to CSV all together.  
   - This method is simple for small scripts but can become hard to maintain and read for larger projects.

2. **Modular Approach (Using Functions)**  
   - In this approach, the code is divided into **functions** for each specific task, such as:
     - Creating the CSV file
     - Fetching a webpage
     - Parsing a product card
     - Writing data to CSV
     - Main loop controlling the scraping
   - This method improves **readability, reusability, and maintainability**.
   - Each function has a clear responsibility, making the code easier to debug and extend.

Both methods achieve the same end result: scraping mobile phone data from the website and saving it to a CSV file.  
The difference lies in **code organization and readability**.


# -------------------------------------------------------------------------------------------------

## 1- Monolithic Approach (All-in-One Code)

In [1]:
# Library
import csv
import random
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# `Note`
**`The code waits a random interval between requests so that its traffic looks like a normal user rather than a bot. This lowers the chance of being detected and blocked by the site's protection system when many requests are sent.`**

In [20]:
# Column names used for the header row and for every data row.
# Defined once so the header and the DictWriter can never drift apart.
csv_columns = [
    'product_name', 'price', 'seller', 'city', 'Governorate',
    'Brand', 'Model', 'RAM', 'Storage', 'Battery_Capacity',
    'Ad_Type', 'Payment_Option', 'Warranty', 'Condition', 'page_number', 'url'
]

# Create a new CSV file and write header row (column names)
with open('dubizzle_mobile_phones.csv', 'w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerow(csv_columns)

page_number = 1  # Start scraping from page 1
count = 0  # Products scraped on the current page (reset after every page)
failed_pages = 0  # Listing pages that failed to load in a row
max_failed_pages = 3  # Stop after this many consecutive failures instead of looping forever
domain = 'https://www.dubizzle.com.eg'  # Base domain used to build full URLs
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    )
}

# Loop to scrape multiple pages
while True:
    url = f'{domain}/en/mobile-phones-tablets-accessories-numbers/mobile-phones/?page={page_number}'  # Build page URL dynamically
    print(f'\n{"( Starting to scrape page : " + str(page_number) + " )":^80}')  # Print current page being scraped
    print(f'{"-"*40:^80}')  # Print separator line for clarity

    try:
        response = requests.get(url, headers=headers, timeout=10)  # Send GET request to fetch page HTML
        response.raise_for_status()  # Raise error if response status is not OK (200)
        soup = BeautifulSoup(response.text, 'html.parser')  # Parse the HTML content using BeautifulSoup
    except requests.RequestException as e:
        print(f"Error fetching main page {page_number}: {e}")  # Print error message if page request fails
        failed_pages += 1
        if failed_pages >= max_failed_pages:
            # Several pages in a row failed: the site is probably unreachable or blocking us
            print("Too many consecutive page errors. Stopping scraper.")
            break
        page_number += 1  # Move on to the next page instead of retrying this one endlessly
        time.sleep(random.uniform(5, 15))  # Back off before the next request
        continue
    failed_pages = 0  # A page loaded, so the failure streak is over

    items = soup.find_all('div', attrs={'class': '_4631a0ca'})  # Find all product container divs
    if not items:  # If no items are found, it means no more pages
        print("No more items found. Stopping scraper.")  # Inform user scraping is done
        break

    # Open CSV file in append mode to add new rows
    with open('dubizzle_mobile_phones.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=csv_columns)  # Write rows using dictionary format

        dash_line = "."  # Used to display a loading animation

        # Loop through each item (product) found on the current page
        for item in items:
            try:
                link = item.find('a')  # Find the <a> tag that contains product link
                href = link.get('href') if link else None
                if not href:  # No link (or a link without href): skip this product
                    continue
                # urljoin gives the same result as domain + href for "/path" links and
                # also copes with absolute or slash-less hrefs
                second_url = urljoin(domain, href)
                time.sleep(random.uniform(5, 10))  # Delay between each product to avoid being blocked

                # Send GET request to the product detail page
                second_response = requests.get(second_url, headers=headers, timeout=10)
                second_response.raise_for_status()  # Raise error if request failed
                second_soup = BeautifulSoup(second_response.text, 'html.parser')  # Parse product detail page

                print(f"\rLoading {dash_line}", end="")  # Print loading dots in one line
                dash_line += "."  # Add another dot to show progress

                # Extract product name safely.
                # Any non-empty title counts; an ASCII-only check would discard
                # titles written entirely in Arabic.
                product_name = 'N/A'  # Default value
                h1_tag = second_soup.find('h1')  # Find product title element
                if h1_tag:
                    name_text = h1_tag.get_text(strip=True)
                    if name_text:
                        product_name = name_text

                # Extract price safely
                price = 'N/A'  # Default value
                price_tag = second_soup.find('span', attrs={'class': '_24469da7'})  # Find price element
                if price_tag:
                    price = price_tag.get_text(strip=True)

                # Extract seller name safely (the page uses one of two class combinations)
                seller = 'N/A'  # Default value
                seller_tag = second_soup.find('span', attrs={'class': '_9a85fb36 b7af14b4'}) or \
                             second_soup.find('span', attrs={'class': '_8206696c b7af14b4'})
                if seller_tag:
                    seller = seller_tag.get_text(strip=True)

                # Extract location (city and governorate)
                city = 'N/A'
                Governorate = 'N/A'
                location_tag = second_soup.find('span', attrs={'aria-label': 'Location', 'class': 'a1c1940e'})
                if location_tag:
                    parts = [p.strip() for p in location_tag.get_text().split(',')]  # "city, governorate"
                    if len(parts) > 0:
                        city = parts[0]
                    if len(parts) > 1:
                        Governorate = parts[1]

                # Extract product specifications from details section
                specs = {}
                for spec_div in second_soup.find_all('div', attrs={'class': '_92439ac7'}):
                    spans = spec_div.find_all('span')  # First span is the key, second the value
                    if len(spans) >= 2:
                        key = spans[0].get_text(strip=True)  # Spec name (e.g., "Brand")
                        val = spans[1].get_text(strip=True)  # Spec value (e.g., "Apple")
                        specs[key] = val

                # Write the extracted data into CSV file
                writer.writerow({
                    'product_name': product_name,
                    'price': price,
                    'seller': seller,
                    'city': city,
                    'Governorate': Governorate,
                    'Brand': specs.get('Brand', 'N/A'),
                    'Model': specs.get('Model', 'N/A'),
                    'RAM': specs.get('RAM', 'N/A'),
                    'Storage': specs.get('Storage', 'N/A'),
                    'Battery_Capacity': specs.get('Battery Capacity', 'N/A'),
                    'Ad_Type': specs.get('Ad Type', 'N/A'),
                    'Payment_Option': specs.get('Payment Option', 'N/A'),
                    'Warranty': specs.get('Warranty', 'N/A'),
                    'Condition': specs.get('Condition', 'N/A'),
                    'page_number': page_number,
                    'url': second_url
                })  # Write one row (product) to CSV file

                count += 1  # Increment per-page product counter

            except Exception as e:
                # Deliberate best-effort: one broken product must not abort the whole page
                print(f"\nError scraping product: {e}")
                continue

    # Read the last page number from the pagination.
    # NOTE(review): this assumes the last matching div holds the final page number - confirm
    # against the live markup. If it is missing or not numeric, last_page stays None and the
    # loop ends on the "no items" check instead.
    pagination = soup.find_all('div', attrs={'title': 'tiq7kl', 'class': '_44eaf83c'})
    last_page_text = pagination[-1].get_text(strip=True) if pagination else ''
    try:
        last_page = int(last_page_text)
    except ValueError:
        last_page = None
    last_page_label = last_page_text or 'N/A'

    print(f"\nSuccessfully scraped page ({page_number} of {last_page_label}) with total ({count}) products.".center(80, " "))  # Summary per page
    print('=' * 80)  # Separator line
    count = 0  # Count products per page, not cumulatively

    # Stop once the last page has been scraped (compare int with int, not int with str)
    if last_page is not None and page_number >= last_page:
        print("No more pages found. Finished scraping.")  # End message
        break  # Stop the while loop

    page_number += 1  # Go to next page
    time.sleep(random.uniform(5, 15))  # Give the code a break before the next page (like a normal user)



                        ( Starting to scrape page : 1 )                         
                    ----------------------------------------                    
Loading ........
Error scraping product: HTTPSConnectionPool(host='www.dubizzle.com.eg', port=443): Read timed out. (read timeout=10)
Loading ............................................            
Successfully scraped page (1) with total (44) products.            

                        ( Starting to scrape page : 2 )                         
                    ----------------------------------------                    
Loading .............................................            
Successfully scraped page (2) with total (89) products.            

                        ( Starting to scrape page : 3 )                         
                    ----------------------------------------                    
Loading ........
Error scraping product: 404 Client Error: Not Found for url: https://www.dubizzle.com.eg/en/a

Loading ........

KeyboardInterrupt: 

- **To hide the `KeyboardInterrupt` error shown above, wrap the scraping loop in `try` / `except KeyboardInterrupt`.**
- **total_products = count of all products from previous pages + count of products on the current page**
  - If you want total_products to represent only the count of products on each individual page, then after scraping all products from the current page, reset the counter to 0.

In [36]:
# Load the scraped data. By default pandas parses the 'N/A' placeholders
# written by the scraper as missing values (NaN).
df = pd.read_csv('dubizzle_mobile_phones.csv')
# dropna() returns a new DataFrame. With inplace=True it returns None, so
# assigning that result back would replace df with None.
df = df.dropna()  # remove rows that contain any missing value
df

Unnamed: 0,product_name,price,seller,city,Governorate,Brand,Model,RAM,Storage,Battery_Capacity,Ad_Type,Payment_Option,Warranty,Condition,page_number,url
5,Samsung A56 new,"EGP 20,000",Mohamed Salah,New Nozha,Cairo,Samsung,A56,8,256 GB,(+) 5000 mAH,For Sale,Cash,Yes,New,1,https://www.dubizzle.com.eg/en/ad/samsung-a56-...
9,iPhone 13 Pro Max for sale,"EGP 39,000",نيمو,Maryotaya,Giza,Apple - iPhone,13 Pro Max,6,256 GB,(+) 4000 mAH,For Sale,Cash,No,Used,1,https://www.dubizzle.com.eg/en/ad/iphone-13-pr...
11,iphone 15 pro,"EGP 51,000",Adham Anbar,Sheikh Zayed,Giza,Apple - iPhone,15 Pro,8,256 GB,(+) 2000 mAH,For Sale,Cash,No,Used,1,https://www.dubizzle.com.eg/en/ad/iphone-15-pr...
13,Iphone 16 Pro Max 256 Gb Desert titanium معفي ...,"EGP 56,000",LORD,Maadi,Cairo,Apple - iPhone,16 Pro Max,8,256 GB,(+) 5000 mAH,For Sale,Cash,Yes,Used,1,https://www.dubizzle.com.eg/en/ad/iphone-16-pr...
14,Samsung Galaxy A16 جديد متبرشم – 256 جيجا | 8 ...,"EGP 9,800",Max Trade,Sheraton,Cairo,Samsung,A16,8,256 GB,(+) 5000 mAH,For Sale,Cash,Yes,New,1,https://www.dubizzle.com.eg/en/ad/samsung-gala...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571,ايفون 6s plus,"EGP 3,200",dykwthnksrvjofd,10th of Ramadan,Sharqia,Apple - iPhone,6S Plus,3,128 GB,(+) 3000 mAH,For Sale,Cash,Yes,Used,14,https://www.dubizzle.com.eg/en/ad/%D8%A7%D9%8A...
574,ريلمى C3,"EGP 2,500",ممتاز عبدالهادي محمد,Ezbet El Nakhl,Cairo,Realme,C3,3,64 GB,(+) 5000 mAH,For Sale,Cash,No,Used,14,https://www.dubizzle.com.eg/en/ad/%D8%B1%D9%8A...
597,15 pro 128Gb ١٥ برو ١٢٨جيجا,"EGP 45,500",User 92n3ha,Sheikh Zayed,Giza,Apple - iPhone,15 Pro,6,128 GB,(+) 3000 mAH,For Sale,Cash,No,Used,14,https://www.dubizzle.com.eg/en/ad/15-pro-128gb...
610,a16. للبيع سامسونج جديد,"EGP 7,500",عمرو,Rod al-Farag,Cairo,Samsung,A16,8,256 GB,(+) 5000 mAH,For Sale,Cash,Yes,New,14,https://www.dubizzle.com.eg/en/ad/a16-%D9%84%D...


# `Note:`
- **The reason for the null values is that the website detected too many frequent requests. Due to its protection mechanisms, it returned empty HTML responses, so the scraper had no data to extract for those products.**

# ------------------------------------------------------------------------------------------------------------

# 2- Modular Approach (Using Functions)

In [10]:
# Site root that relative product links are appended to
domain = 'https://www.dubizzle.com.eg'

# Output file of the modular scraper
csv_file = 'dubizzle_mobile_phones_1.csv'

# Column order of the CSV: header row and every product row
fieldnames = [
    # listing basics
    'product_name',
    'price',
    'seller',
    'city',
    'Governorate',
    # specification table
    'Brand',
    'Model',
    'RAM',
    'Storage',
    'Battery_Capacity',
    'Ad_Type',
    'Payment_Option',
    'Warranty',
    'Condition',
    # bookkeeping
    'page_number',
    'url',
]

# Browser-like User-Agent sent with every request
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    ),
}

In [11]:
def create_csv():
    """
    Start a fresh output file that contains only the header row.

    Opens ``csv_file`` in write mode (replacing any existing file of that
    name) and writes ``fieldnames`` as its first row.
    """
    with open(csv_file, mode='w', newline='', encoding='utf-8') as out:
        csv.writer(out).writerow(fieldnames)

In [12]:
def fetch_page(url):
    """
    Download a page and parse it with BeautifulSoup.

    The request is sent with the module-level ``headers`` and a 10-second
    timeout. Request failures (including non-success HTTP status codes) are
    reported with a printed message rather than raised.

    Parameters:
        url (str): Address of the page to download.

    Returns:
        BeautifulSoup object for the page, or None if the request failed.
    """
    try:
        reply = requests.get(url, timeout=10, headers=headers)
        reply.raise_for_status()  # Turn HTTP error statuses into exceptions
        html = reply.text
        return BeautifulSoup(html, 'html.parser')
    except requests.RequestException as err:
        print(f"[Error] Failed to fetch {url}: {err}")
    return None

In [13]:
def parse_product_card(item, page_number):
    """
    Extract product details from a product card and its detail page.

    The card's first link is followed (after a random 2-7 second pause) and
    the detail page is scraped for name, price, seller, location and the
    specification table. Fields that are not found are set to 'N/A'.

    Parameters:
        item: BeautifulSoup element containing the product card
        page_number (int): Current page number

    Returns:
        dict: Product information
        None: If the card has no usable link, the detail page could not be
              fetched, or an unexpected error occurred while parsing
    """
    try:
        link_tag = item.find('a')
        href = link_tag.get('href') if link_tag else None
        if not href:
            # No link, or a link without href: nothing to follow
            return None

        # urljoin gives the same result as domain + href for "/path" links and
        # also copes with absolute or slash-less hrefs
        second_url = urljoin(domain, href)
        time.sleep(random.uniform(2, 7))  # Pause before each detail request
        second_soup = fetch_page(second_url)
        if second_soup is None:
            return None

        # Product Name
        h1_tag = second_soup.find('h1')
        product_name = h1_tag.get_text(strip=True) if h1_tag else 'N/A'

        # Price
        price_tag = second_soup.find('span', attrs={'class': '_24469da7'})
        price = price_tag.get_text(strip=True) if price_tag else 'N/A'

        # Seller (the page uses one of two class combinations)
        seller_tag = second_soup.find('span', attrs={'class': '_9a85fb36 b7af14b4'}) or \
                     second_soup.find('span', attrs={'class': '_8206696c b7af14b4'})
        seller = seller_tag.get_text(strip=True) if seller_tag else 'N/A'

        # Location: text is split on commas into city and governorate
        city = Governorate = 'N/A'
        location_tag = second_soup.find('span', attrs={'aria-label': 'Location', 'class': 'a1c1940e'})
        if location_tag:
            parts = [p.strip() for p in location_tag.get_text().split(',')]
            if len(parts) > 0:
                city = parts[0]
            if len(parts) > 1:
                Governorate = parts[1]

        # Specifications: first span of each row is the key, second the value
        specs = {}
        for div in second_soup.find_all('div', attrs={'class': '_92439ac7'}):
            spans = div.find_all('span')
            if len(spans) >= 2:
                specs[spans[0].get_text(strip=True)] = spans[1].get_text(strip=True)

        # Create a dictionary of all product information
        product_data = {
            'product_name': product_name,
            'price': price,
            'seller': seller,
            'city': city,
            'Governorate': Governorate,
            'Brand': specs.get('Brand', 'N/A'),
            'Model': specs.get('Model', 'N/A'),
            'RAM': specs.get('RAM', 'N/A'),
            'Storage': specs.get('Storage', 'N/A'),
            'Battery_Capacity': specs.get('Battery Capacity', 'N/A'),
            'Ad_Type': specs.get('Ad Type', 'N/A'),
            'Payment_Option': specs.get('Payment Option', 'N/A'),
            'Warranty': specs.get('Warranty', 'N/A'),
            'Condition': specs.get('Condition', 'N/A'),
            'page_number': page_number,
            'url': second_url
        }

        return product_data
    except Exception as e:
        # Deliberate best-effort: one malformed product must not stop the scrape
        print(f"[Error] Failed to parse product: {e}")
        return None


In [14]:
def write_to_csv(data):
    """
    Append one product record to the output CSV.

    The file named by ``csv_file`` is opened in append mode for each call
    and one row is written in ``fieldnames`` column order.

    Parameters:
        data (dict): Product information keyed by column name
    """
    with open(csv_file, mode='a', newline='', encoding='utf-8') as out:
        csv.DictWriter(out, fieldnames=fieldnames).writerow(data)

In [15]:
def _last_page_number(soup):
    """
    Read the last page number from a listing page's pagination.

    NOTE(review): assumes the last matching div holds the final page
    number - confirm against the live markup.

    Parameters:
        soup: BeautifulSoup object of a listing page

    Returns:
        int: Number shown in the last pagination element
        None: If no pagination element exists or its text is not a number
    """
    pagination = soup.find_all('div', attrs={'title': 'tiq7kl', 'class': '_44eaf83c'})
    if not pagination:
        return None
    try:
        return int(pagination[-1].get_text(strip=True))
    except ValueError:
        return None


def main(max_failed_pages=3):
    """
    Main function controlling the scraping process.

    - Creates CSV
    - Iterates through pages
    - Scrapes product cards
    - Writes data to CSV

    The loop ends when a page has no products, when the last page reported
    by the pagination has been scraped, or when too many listing pages in a
    row fail to load.

    Parameters:
        max_failed_pages (int): Number of consecutive listing-page fetch
            failures after which scraping stops.
    """
    create_csv()
    page_number = 1
    failed_pages = 0  # Listing pages that failed to load in a row

    while True:
        url = f'{domain}/en/mobile-phones-tablets-accessories-numbers/mobile-phones/?page={page_number}'
        soup = fetch_page(url)
        if soup is None:
            print(f"Skipping page {page_number} due to fetch error.")
            failed_pages += 1
            if failed_pages >= max_failed_pages:
                # Without this cap an unreachable site would make the loop skip pages forever
                print("Too many consecutive page errors. Stopping scraper.")
                break
            page_number += 1
            time.sleep(random.uniform(5, 15))  # Back off before the next request
            continue
        failed_pages = 0  # A page loaded, so the failure streak is over

        print(f'( Starting to scrape page : {page_number} )'.center(80, " "))
        print("-" * 40)

        items = soup.find_all('div', attrs={'class': '_4631a0ca'})
        if not items:
            print("No more items found. Stopping scraper.")
            break

        count = 0  # Products scraped on this page
        dash_line = "."  # Loading animation
        for item in items:
            print(f"\rLoading {dash_line}", end="")
            dash_line += "."
            product_data = parse_product_card(item, page_number)
            if product_data:
                write_to_csv(product_data)
                count += 1

        print(f"\nSuccessfully scraped page ({page_number}) with ({count}) products.".center(80, " "))
        print("="*80)

        # Stop after the last page. The page number is compared as an int;
        # comparing it with the raw pagination text (a str) is never equal.
        last_page = _last_page_number(soup)
        if last_page is not None and page_number >= last_page:
            print("No more pages found. Finished scraping.")  # End message
            break  # Stop the while loop

        page_number += 1  # Go to next page
        time.sleep(random.uniform(5, 15))  # Pause between listing pages, like a normal user

In [None]:
# Entry point: start the scraper when this code is executed directly rather than imported
if __name__ == "__main__":
    main()
