In [6]:
import requests
from bs4 import BeautifulSoup
import csv
from time import sleep
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

total_counter = 0

# Function to create a session with retry mechanism
def create_session():
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    session.mount('http://', HTTPAdapter(max_retries=retries))
    return session

# Open a CSV file in write mode
with open('hatla2ee_scraped_data1.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # Define fieldnames for the CSV file
    fieldnames = ['Name', 'Price', 'Color', 'Mileage', 'Make', 'Model', 'City', 'Date Displayed', 'Automatic Transmission', 'Air Conditioner', 'Power Steering', 'Remote Control', 'Item URL']
    
    # Initialize a CSV writer object
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    # Write the header row
    writer.writeheader()
    
    session = create_session()

    for i in range(1, 750):
    #for i in range(1, 1000):
        url = f"https://eg.hatla2ee.com/en/car/page/{i}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

        try:
            # Send a GET request to the URL using the session
            response = session.get(url, headers=headers)
            response.raise_for_status()  # Raises HTTPError for bad responses

            # Parse the HTML content
            soup = BeautifulSoup(response.content, "html.parser")

            # Find all car cards
            car_cards = soup.find_all("div", class_="newCarListUnit_contain")
            if not car_cards:
                break

            counter = 0

            # Iterate over each car card
            for card in car_cards:
                #print (card)
                car_data = {
                    'Name': None, 'Price': None, 'Color': None, 'Mileage': None,
                    'Make': None, 'Model': None, 'City': None, 'Date Displayed': None,
                    'Automatic Transmission': 'No', 'Air Conditioner': 'No',
                    'Power Steering': 'No', 'Remote Control': 'No', 'Item URL': None
                }

                try:
                    car_data['Name'] = card.find("div", class_="newCarListUnit_header").text.strip()
                except AttributeError:
                    pass

                try:
                    car_data['Price'] = card.find("div", class_="main_price").text.strip()
                    if car_data['Price'] == "-":
                        car_data['Price'] = None
                except AttributeError:
                    pass

                try:
                    meta_tags = card.find_all("span", class_="newCarListUnit_metaTag")
                    car_data['Color'] = meta_tags[0].text.strip()
                    car_data['Mileage'] = meta_tags[-1].text.strip()
                    if car_data['Mileage'] == "- Km":
                        car_data['Mileage'] = None
                except (AttributeError, IndexError):
                    pass

                try:
                    meta_links = card.find("div", class_="newCarListUnit_metaTags").find_all("span", class_="newCarListUnit_metaLink")
                    car_data['Make'] = meta_links[0].text.strip()
                    car_data['Model'] = meta_links[1].text.strip()
                    car_data['City'] = meta_links[-1].text.strip()
                except (AttributeError, IndexError):
                    pass

                try:
                    car_data['Date Displayed'] = card.find("div", class_="otherData_Date").find("span").text.strip()
                except AttributeError:
                    pass
                
                try:
                    icons_element = card.find("div", class_="otherData_carType")
                    if icons_element.find('i', {'title': 'Automatic'}):
                        car_data['Automatic Transmission'] = 'Yes'
                    if icons_element.find('i', {'title': 'Air Conditioner'}):
                        car_data['Air Conditioner'] = 'Yes'
                    if icons_element.find('i', {'title': 'Power Steering'}):
                        car_data['Power Steering'] = 'Yes'
                    if icons_element.find('i', {'title': 'Remote Control'}):
                        car_data['Remote Control'] = 'Yes'
                except AttributeError:
                    pass
                
                try:
                    car_data['Item URL'] = f"https://eg.hatla2ee.com{card.find('div', class_='newMainImg').find('a').get('href')}"
                except AttributeError:
                    pass
                
                # Write the row to the CSV file
                writer.writerow(car_data)
                counter += 1

            print(f"***** Page {i} Scrapped Successfully with {counter} Items *****")
            total_counter += counter
            sleep(5)
        except requests.RequestException as e:
            print(f"Error occurred while scraping page {i}: {e}")
            sleep(30)


***** Page 1 Scrapped Successfully with 40 Items *****
***** Page 2 Scrapped Successfully with 40 Items *****
***** Page 3 Scrapped Successfully with 40 Items *****
***** Page 4 Scrapped Successfully with 40 Items *****


In [None]:
# Importing required libraries
import requests
from bs4 import BeautifulSoup
from pandas import *
import csv
import re
import json

#need to scrap cars from following page : https://www.dubizzle.com.eg/en/vehicles/cars-for-sale/?page=2
#open csv file to write data in it
with open('olx_scraped_data1.csv', 'w', newline='', encoding='utf-8') as csvfile:
#with open('tmmmp', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Make', 'Model', 'Year', 'Price' , 'Transmission', 'Kilometers', 'Fuel']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    Make_array = []
    Model_array = []
    Year_array = []
    Price_array = []
    Transmission_array = []
    Kilometers_array = []
    Fuel_array = []
    #loop through all pages
    for i in range(1, 200):
#    for i in range(2, 3):
        print ("scrapping page " , i)
        url = f"https://www.dubizzle.com.eg/en/vehicles/cars-for-sale/?page={i}"
        cars_dev = requests.get(url)
        soup = BeautifulSoup(cars_dev.content, 'html.parser')
        # Print the HTML content for debugging
        page_content = soup.prettify()
        #print (page_content)
        #"name_l1":"Brand","attribute":"make","formattedValue":"مرسيدس بنز","formattedValue_l1":"Mercedes-Benz"},{"name":"السنة","name_l1":"Year","attribute":"year","formattedValue":"2022","formattedValue_l1":"2022"},{"name":"موديل","name_l1":"Model","attribute":"model","formattedValue":"سي 180","formattedValue_l1":"C180"},{"name":"السعر","name_l1":"Price","attribute":"price","formattedValue":"3,300,000","formattedValue_l1":"3,300,000"}, ,{"name":"ناقل الحركة","name_l1":"Transmission Type","attribute":"transmission","formattedValue":"اوتوماتيك","formattedValue_l1":"Automatic"}
        #extract part from the page between two patterns
        match = re.search('name_l1":"Brand', page_content)
        page_content_simplified = page_content[match.start():]
        matches = list(re.finditer('"formattedValue_l1"', page_content_simplified))
        last_match = matches[-1]
        page_content_simplified2 = page_content_simplified[:last_match.start()]
        #add {" to the start of string
        page_content_simplified2 = '"' + page_content_simplified2
        #print (page_content_simplified2)
        #split by },{
        page_content_simplified2_list = page_content_simplified2.split('},{')
        print (page_content_simplified2_list)
        #loop through all cars
        car_make = ""
        car_model = ""
        car_year = ""
        car_price = ""
        car_transmission = ""
        car_kilometers = ""
        car_fuel = ""
        for car in page_content_simplified2_list:
            #print (car)
            #"attribute":"make","formattedValue":"مرسيدس بنز","formattedValue_l1":"Mercedes-Benz"
            #get the value of attribute and formattedValue_l1
            attributes = re.findall('"attribute":"(.*?)"', car)
            #convert to string
            #attributes = ''.join(attributes)
            formattedValue_l1 = re.findall('"formattedValue_l1":"(.*?)"', car)
            #print (attributes)
            #print (formattedValue_l1)
            if 'make' in attributes:
                #save the info of last car if complete info
                if car_make != "" and car_model != "" and car_year != "" and car_price != "" and car_transmission != "" and car_kilometers != "" and car_fuel != "":
                    writer.writerow({'Make': car_make, 'Model': car_model, 'Year': car_year, 'Price': car_price, 'Transmission': car_transmission, 'Kilometers': car_kilometers, 'Fuel': car_fuel})
                #start new car saving
                car_make = formattedValue_l1[0]
                car_model = ""
                car_year = ""
                car_price = ""
                car_transmission = ""
                car_kilometers = ""
                car_fuel = ""
            elif 'year' in attributes:
                if formattedValue_l1 != []:
                    car_year = formattedValue_l1[0]
                else:
                    car_year = ""
            elif 'model' in attributes:
                if formattedValue_l1 != []:
                    car_model = formattedValue_l1[0]
                else:
                    car_model = ""
            elif 'price' in attributes:
                if formattedValue_l1 != []:
                    car_price = formattedValue_l1[0].replace(',', '')
                else:
                    car_price = ""
            elif 'petrol' in attributes:
                if formattedValue_l1 != []:
                    car_fuel = formattedValue_l1[0]
                else:
                    car_fuel = ""
            elif 'mileage' in attributes:
                if formattedValue_l1 != []:
                    car_kilometers = formattedValue_l1[0]
                else:
                    car_kilometers = ""
            elif 'transmission' in attributes:
                if formattedValue_l1 != []:
                    car_transmission = formattedValue_l1[0]
                else:
                    car_transmission = ""


scrapping page  2
['"name_l1":"Brand","attribute":"make","formattedValue":"مرسيدس بنز","formattedValue_l1":"Mercedes-Benz"', '"name":"السنة","name_l1":"Year","attribute":"year","formattedValue":"2023","formattedValue_l1":"2023"', '"name":"اللون","name_l1":"Color","attribute":"color","formattedValue":"أسود","formattedValue_l1":"Black"', '"name":"عدد الأبواب","name_l1":"Number of doors","attribute":"doors","formattedValue":"4\\u002F5","formattedValue_l1":"4\\u002F5"', '"name":"موديل","name_l1":"Model","attribute":"model","formattedValue":"سي 180","formattedValue_l1":"C180"', '"name":"السعر","name_l1":"Price","attribute":"price","formattedValue":"2,900,000","formattedValue_l1":"2,900,000"', '"name":"عدد المقاعد ","name_l1":"Number of seats","attribute":"seats","formattedValue":"5","formattedValue_l1":"5"', '"name":"فيديو","name_l1":"Video","attribute":"video","formattedValue":"غير متوفر","formattedValue_l1":"Not Available"', '"name":"نوع الوقود","name_l1":"Fuel Type","attribute":"petrol",

In [62]:
import requests
from bs4 import BeautifulSoup
import csv
from time import sleep
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

total_counter = 0

# Function to create a session with retry mechanism
def create_session():
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    session.mount('http://', HTTPAdapter(max_retries=retries))
    return session

# Open a CSV file in write mode
with open('hatla2ee_newcars_data1.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # Define fieldnames for the CSV file
    fieldnames = ['Name', 'Min_Price', 'Max_Price']
    
    # Initialize a CSV writer object
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    # Write the header row
    writer.writeheader()
    
    session = create_session()
    name_list = []
    min_price_list = []
    max_price_list = []
    for i in range(1,18):
    #for i in range(1, 750):
    #for i in range(1, 1000):
        url = f"https://eg.hatla2ee.com/en/new-car/page/{i}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

        try:
            # Send a GET request to the URL using the session
            response = session.get(url, headers=headers)
            response.raise_for_status()  # Raises HTTPError for bad responses

            # Parse the HTML content
            soup = BeautifulSoup(response.content, "html.parser")
            #print (soup)
            # Find all car cards
            car_cards = soup.find_all("div", class_="nCarList_contain")
            if not car_cards:
                break

            

            # Iterate over each car card
            for card in car_cards:
                #print (card)
#                <a class="nCarListData_title" href="/en/new-car/byd/f3">
#                                            Byd F3 2024                                         </a>
#</div>
#<span>Official Price</span>
#<strong>520,000 EGP</strong>
#<strong>600,000 EGP</strong>
                #looping over lines of card
                new_car_insertion = 0
                official_price = 0
                min_price_flag = 0
                #get lines of card into array lines
                lines = str(card).split('\n')
                for j in range(len(lines)):
                    #print (line)
                    if "nCarListData_title" in lines[j]:
                        car_name = lines[j+1].replace('</a>', '')
                        car_name = car_name.strip()
                        new_car_insertion = 1
                        official_price = 0
                        min_price_flag = 0
                        print (car_name)
                    elif  "Official Price" in lines[j]:
                        official_price = 1
                    elif "EGP" in lines[j] and official_price == 1 and min_price_flag == 0:
                        min_price = lines[j].split('>')[1].split('<')[0].replace(',', '').replace('EGP', '').strip()
                        min_price_flag = 1
                        print (min_price)
                    elif "EGP" in lines[j] and official_price == 1 and min_price_flag == 1:
                        max_price = lines[j].split('>')[1].split('<')[0].replace(',', '').replace('EGP', '').strip()
                        print (max_price)
                        name_list.append(car_name)
                        min_price_list.append(min_price)
                        max_price_list.append(max_price)
            print(f"***** Page {i} Scrapped Successfully  *****")
            sleep(5)
        except requests.RequestException as e:
            print(f"Error occurred while scraping page {i}: {e}")
            sleep(30)
    for i in range(len(name_list)):
        writer.writerow({'Name': name_list[i], 'Min_Price': min_price_list[i], 'Max_Price': max_price_list[i]})
                    

\
  
      





SYM Jet x 2024
SYM Fiddle 2 2024
Chery Arrizo 5 2024
630000
700000
715000
730000
Nissan Sunny 2024
695400
761000
Toyota Corolla 2025
1400000
1750000
1510000
1780000
Chevrolet Optra 2025
724900
749900
749900
774900
Byd F3 2024
520000
600000
Fiat Tipo 2025
1100000
1325000
Hyundai Elantra CN7 2025
1300000
1830000
Kia Sportage 2025
1789900
2289900
Chery Tiggo 3 2024
830000
Hyundai Elantra HD 2024
725000
890000
Lotus Emeya 2024
Lotus Eletre 2024
BMW 520 2024
BMW 530 2024
Jetour T2 2025
0
1895000
Audi RS Q8 2024
Rox 01 2025
Byd F3 2024
520000
600000
Suzuki S Presso 2024
549900
Chery Arrizo 5 2024
630000
700000
715000
730000
Byd F3 2025
650000
730000
Chery Arrizo 5 2025
665000
745000
700000
810000
Suzuki Swift Dzire 2024
669000
Hyundai Accent RB 2024
689000
810000
Nissan Sunny 2025
695000
795000
800000
815000
Nissan Sunny 2024
695400
761000
Suzuki Swift 2024
704900
724900
Chevrolet Optra 2025
724900
749900
749900
774900
Hyundai Elantra HD 2024
725000
890000
Suzuki Swift 2024 Facelift
744900
7