In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 15

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(1, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df.head()


Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
1,,منزله عماره زينه ف داره البركه اللبيع ... ...,Bâtiment,36 00 01 51,"17,000,000 MRU"
2,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
3,,فرصة دار أف الإتحادية مونطر فيه بيت 5■4 ...,Maison - Villa,36 41 70 43,"350,000 MRU"
4,,سمعة جديدة للبيع في تفرغ زينة بالقرب من أت...,Maison - Villa,22 36 64 31,"78,000,000 MRU"


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 30

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(16, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df1 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df1.head()


Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
1,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
2,,منزله عماره زينه ف داره البركه اللبيع ... ...,Bâtiment,36 00 01 51,"17,000,000 MRU"
3,,السلام عليكم عندن ذيك دار فسكوجيم عين طلح ...,Maison - Villa,37 44 50 56,"1,400,000 MRU"
4,,منزل بطابقين و مساحة 350 م2 متموقع في سيتي...,Maison - Villa,46 78 70 51,prix indéterminé


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 45

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(31, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df2 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df2.head()

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
1,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
2,,منزله عماره زينه ف داره البركه اللبيع ... ...,Bâtiment,36 00 01 51,"17,000,000 MRU"
3,,منزل في بوحديدة نيمرو13/16اعل كدروه تيو بي...,Maison - Villa,41 41 12 63,13 MRU
4,,منزل جديده بالقرب من دبو المهدي فاتح على ...,Maison - Villa,37 44 50 56,"15,000,000 MRU"


In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 60

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(46, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df3 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df3.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
1,,منزله عماره زينه ف داره البركه اللبيع ... ...,Bâtiment,36 00 01 51,"17,000,000 MRU"
2,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
3,,فيلا للبيع في حي صكوك,Bâtiment,33 48 88 94,"21,000,000 MRU"
4,,سمعه زينه وجديده مافاتت اندخلت في سكوجيم ع...,Bâtiment,33 48 88 94,"35,000,000 MRU"
5,,دار في تيارت احذ مرصت سيدي,Maison - Villa,33 48 88 94,"7,000,000 MRU"
6,,فرصة نيمرو,Maison - Villa,38 11 97 47,"1,800,000 MRU"
7,,سمعه في الزعطر زينه فاتحه اعل شارع 12,Maison - Villa,42 50 88 93,"1,350,000 MRU"
8,,قطعة أرض قريب من من منزل لدور في سانتر امت...,Maison - Villa,36 68 91 11,"130,000 MRU"
9,,فيلا أرضية فيها 3غرف وجلسة فصحراوي مبيوعة ...,Maison - Villa,20 58 44 85,"35,000,000 MRU"


In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 75

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(61, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df4 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df4.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
1,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
2,,منزله عماره زينه ف داره البركه اللبيع ... ...,Bâtiment,36 00 01 51,"17,000,000 MRU"
3,,سمعة في الترحيل على شارع اكبير مبيوعه با14...,Maison - Villa,26 68 68 69,"1,400,000 MRU"
4,,سمعة نظيفة للبيع فاتح شارع اكبيرالموقع :كر...,Maison - Villa,22 60 80 90,"1,500,000 MRU"
5,,منزل نظيف للبيع فاتح اعل بارك الموقع : بو...,Maison - Villa,22 60 80 90,"1,200,000 MRU"
6,,السلام عليكم هذي دار في ترحيل دبي ميوعه 5م...,Maison - Villa,31 06 45 08,"550,000 MRU"
7,,السلام عليكم هذي دار فمدال D بالقرب من كدر...,Maison - Villa,31 06 45 08,"4,500,000 MRU"
8,,السلام عليكم هذ سمعه اعلي كدروه كوىنا في ا...,Maison - Villa,31 06 45 08,"3,800,000 MRU"
9,,السلام عليكم هذي دار في لاص بلماس بالقرب م...,Maison - Villa,31 06 45 08,"5,000,000 MRU"


In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 90

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(76, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df5 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df5.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
1,,سمعة زين مشاء الله ونيمروها كبير وفلكَرن و...,Maison - Villa,42 05 30 73,"3,000,000 MRU"
2,,منزل جديد مافات اندخل فاتح اعل شارع 6 م ال...,Maison - Villa,22 60 80 90,"1,300,000 MRU"
3,,دار فاتحة على شارع كبير و بينها مع كرن شار...,Maison - Villa,20 70 88 88,"2,500,000 MRU"
4,,منزل ف كرفور مساحته 180 فاتح اعل شارع اكبي...,Maison - Villa,36 30 64 59,"2,200,000 MRU"
5,,دار للبيع أف عين الطلح كدروه الطينطان خاصت...,Maison - Villa,44 80 30 00,"1,100,000 MRU"
6,,زين ماشاء الله,Maison - Villa,49 29 92 74,"1,700,000 MRU"
7,,فيلا للبيع في تفرغ زينة الصحراوي مساحتها 5...,Maison - Villa,22 36 64 31,"65,000,000 MRU"
8,,دار للبيع أف عين الطلح كدروه الطينطان خاصت...,Maison - Villa,27 83 63 16,"1,100,000 MRU"
9,,عندنا دار فعين طلح 3غرف وجلسة حذا گدروه طي...,Maison - Villa,20 58 44 85,"12,000,000 MRU"


In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 105

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(91, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df6 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df6.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
1,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
2,,منزله عماره زينه ف داره البركه اللبيع ... ...,Bâtiment,36 00 01 51,"17,000,000 MRU"
3,,السلام عليكمعندي دار فاتحه اعل اطلي الي ك...,Maison - Villa,33 42 66 66,"950,000 MRU"
4,,دار نظيفة جدا تبارك الله,Maison - Villa,49 29 92 74,"3,000,000 MRU"
5,,موقعها نظيف وفاتح في برك وگريب من كدروه,Maison - Villa,49 29 92 74,"3,000,000 MRU"
6,,فرصة ابرتماهات مبنيين بناي مظبوط فكرن وفات...,Bâtiment,42 50 88 93,"2,400,000 MRU"
7,,فندق وصال للبيعالموقع :تفرغ زين بالقرب من ...,Bâtiment,22 60 80 90,"75,000,000 MRU"
8,,السلام عليكمعندي دار في الحي الساكنه (شانت...,Maison - Villa,33 42 66 66,"470,000 MRU"
9,,منزل نظيفة للبيع فاتح اعل شارع اكبيرالموقع...,Maison - Villa,22 60 80 90,"1,550,000 MRU"


In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 120

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(106, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df7 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df7.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
1,,منزله عماره زينه ف داره البركه اللبيع ... ...,Bâtiment,36 00 01 51,"17,000,000 MRU"
2,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
3,,السلام عليكمعندنا فيلا أرضية جديدة مافات ن...,Maison - Villa,47 68 78 88,"2,500,000 MRU"
4,,زينه ماشاءالله وشارعها كبير ونظيف,Maison - Villa,49 29 92 74,"1,800,000 MRU"
5,,وسط البنيان وكريب من كدروه وزين ماشاءالله,Maison - Villa,49 29 92 74,"1,550,000 MRU"
6,,زينه تبارك الله وهينه,Maison - Villa,49 29 92 74,"1,900,000 MRU"
7,,العرض هو عبارة عن دار في المشروع في مكان خ...,Maison - Villa,37 30 00 58,"14,000,000 MRU"
8,,العرض هو عبارة عن دار في المشروع في مكان خ...,Maison - Villa,37 30 00 58,"14,000,000 MRU"
9,,السلام عليكمعندنا فيلا أرضية جديدة مافات ن...,Maison - Villa,47 68 78 88,"2,500,000 MRU"


In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 135

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(121, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df8 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df8.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,منزله عماره زينه ف داره البركه اللبيع ... ...,Bâtiment,36 00 01 51,"17,000,000 MRU"
1,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
2,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
3,,فرصة اليوم لاتعوضدار للبيع فكرن فاالترحيل ...,Maison - Villa,38 57 18 08,"800,000 MRU"
4,,هاذ السمع زين والد حل 150شهريا السلام,Maison - Villa,32 20 38 00,"2,400,000 MRU"
5,,سمعة للبيع من درجة الأولي المنطقة صانتر أم...,Maison - Villa,43 24 20 04,"8,000,000 MRU"
6,,فرصة دار ماه واعر عند كارفور عزيز على گدرو...,Maison - Villa,43 42 45 45,prix indéterminé
7,,السلام عليكم هذي سمعه فتن سويلم فاتح في اب...,Maison - Villa,31 06 45 08,"2,200,000 MRU"
8,,السلام عليم هذي دار فسكجيم ول بو عمانو مسا...,Maison - Villa,31 06 45 08,"5,200,000 MRU"
9,,فرصة منزل استثماري للبيعسمعة اكبير فكرن فا...,Maison - Villa,38 57 18 08,prix indéterminé


In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 150

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(136, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df9 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df9.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,منزله عماره زينه ف داره البركه اللبيع ... ...,Bâtiment,36 00 01 51,"17,000,000 MRU"
1,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
2,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
3,,دار زينة للبيع فاالترحيل 18 المساحة 159m ف...,Maison - Villa,38 57 18 08,"800,000 MRU"
4,,للمزيد من التفاصيل برجى الاتصال,Maison - Villa,38 38 90 00,"1,250,000 MRU"
5,,تتكون من صالون 12ميتر وجلسة 5متير و2كراج ...,Maison - Villa,22 24 11 72,"4,500,000 MRU"
6,,تتألف من صالون وغرفتين وغرفة نوم وجلسة زين...,Maison - Villa,22 24 11 72,"2,800,000 MRU"
7,,السلام عليكم هذي سمعة فرصة بالقرب من كافور...,Maison - Villa,31 06 45 08,"600,000 MRU"
8,,السلام عليكم هذي سمعه فكرفور بقالة الله اك...,Maison - Villa,31 06 45 08,"1,000,000 MRU"
9,,فرصة دار للبيع ف قندهار قرب شارع مشليه بين...,Maison - Villa,26 73 71 21,prix indéterminé


In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 165

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(151, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df10 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df10.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
1,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
2,,سمعة في عرفات ماه أبعيد من گدروه داي 15 أج...,Maison - Villa,27 82 81 31,"600,000 MRU"
3,,نيمرو ف بريمير امكيزيرة 3 فيه سكن كامل يتك...,Maison - Villa,48 79 49 93,"350,000 MRU"
4,,السلام عليكم هذي دار اعلي كدروه داي 11 اجد...,Maison - Villa,31 06 45 08,"1,400,000 MRU"
5,,السلام عليكم هذي سمعه فكرفور بالقرب من بقا...,Bâtiment,31 06 45 08,"2,000,000 MRU"
6,,السلام عليكم هذي دار فكرفور بالقرب من معهد...,Maison - Villa,31 06 45 08,"1,200,000 MRU"
7,,منزل ضخم قرب سيفارت فرنسا طابقين وفيه ابرت...,Maison - Villa,41 51 87 84,"8,000,000 MRU"
8,,منزلين في توجنين متلاصقين مواصفات جيدة سعر...,Maison - Villa,22 99 23 57,"900,000 MRU"
9,,السلام عليكم هذي دار فيها من فوك برتماهين ...,Maison - Villa,31 06 45 08,"3,300,000 MRU"


In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 180

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(166, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df11 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df11.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
1,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
2,,دار امع ابرتما صحراوي للبيعMaison + appar...,Maison - Villa,20 40 98 41,"6,200,000 MRU"
3,,السلام عليكم هذي نيمرو من h اسويت مساحة 30...,Maison - Villa,31 06 45 08,"840,000 MRU"
4,,دار وترابها كبيرة مشالله فتنويش ماهي واعرة...,Maison - Villa,34 94 41 39,prix indéterminé
5,,السلام عليكم هذي دار اعلي كدروه النائب فتو...,Maison - Villa,31 06 45 08,"1,200,000 MRU"
6,,فرصة دارزبن فعين الطلح بين واحد امع كدروه ...,Maison - Villa,38 57 18 08,"1,600,000 MRU"
7,,نصه مقطي والثاني لاحك لكواوة,Maison - Villa,48 38 90 00,"380,000 MRU"
8,,فرصة دار سمعة للبيع فتفرق زينة كريب من مور...,Maison - Villa,38 57 18 08,"6,500,000 MRU"
9,,فيلا معدله تعدال زين فاتح فبرك وقريب من ات...,Maison - Villa,36 28 99 35,prix indéterminé


In [23]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 195

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(181, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df12 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df12.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
1,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
2,,شانتيه مغطيه في الفلوجه كريب من كدروه فاتح...,Maison - Villa,47 09 84 43,"700,000 MRU"
3,,السلام عليكم هذي سمعه فتن سويلم بالقرب من ...,Maison - Villa,31 06 45 08,"1,800,000 MRU"
4,,منزل في تنكد يتكون من ثلاثة بيوت و مرافق و...,Maison - Villa,48 32 02 36,"700,000 MRU"
5,,سلام عليكم و رحمة الله تعالى و بركاته عندي...,Maison - Villa,37 10 10 70,"2,300,000 MRU"
6,,السلام عليكم هذي دار اكريب من كرفور لبرار ...,Maison - Villa,31 06 45 08,"1,200,000 MRU"
7,,دار أرضية كد عباد الرحمان 2مساحتها 216السع...,Maison - Villa,31 06 45 08,"2,000,000 MRU"
8,,شانتي مغطية فلوزوه ومنقوش خاصها بينتير وكر...,Maison - Villa,31 06 45 08,"2,200,000 MRU"
9,,سمعة كريب من كدروه من فوگ مستقله صالحه للك...,Maison - Villa,46 11 30 01,"1,500,000 MRU"


In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 210

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(196, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df13 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df13.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
1,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
2,,لاتفوتكم الفرصة هذي دار ماه محوطة فلفلوجة ...,Maison - Villa,38 57 18 08,"550,000 MRU"
3,,فرصة سمعة فعرفات اهل بايعين زين قتبارك الل...,Maison - Villa,38 57 18 08,"1,200,000 MRU"
4,,Maison à vendre 150m étage en bas 2 chanbr...,Maison - Villa,43 30 08 99,"1,350,000 MRU"
5,,منزل للبيع قرب دار الحاكم توجونين بوحديدة ...,Maison - Villa,36 61 10 10,"2,200,000 MRU"
6,,سمعه فالمشروع قريبه من كدروه ول سبر شارعها...,Bâtiment,32 72 39 79,"2,700,000 MRU"
7,,فيلا سمعه فتفرغ زينه اترابها 800م .طابق ار...,Maison - Villa,32 72 39 79,prix indéterminé
8,,السلام عليكم ورحمة الله تعالي وبركاته عندي...,Maison - Villa,36 30 80 70,"6,500,000 MRU"
9,,"منزل بالقرب من كرفور سكتير 2,فاتح اعل امبد...",Maison - Villa,31 35 34 35,"580,000 MRU"


In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 225

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(211, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df14 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df14.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
1,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
2,,دار زينة ف كرفور كد مرصت صونادار,Maison - Villa,48 59 74 86,"8,000,000 MRU"
3,,تطل على كدروه وفيهاغرفتي نوم وصالون وبيتين...,Maison - Villa,22 24 11 72,"3,500,000 MRU"
4,,سمعه اكبير فيه ربع ابيوت اركاد اكبار كاملي...,Maison - Villa,26 38 77 99,"4,700,000 MRU"
5,,KossovoPrêt du goudronEt a l’angle- *En ba...,Maison - Villa,46 03 45 70,"2,200,000 MRU"
6,,يمكنكم التواصل معنا عبر الواتساب علي الرقم...,Maison - Villa,47 60 00 73,"1,200,000 MRU"
7,,ديبلكس فاصحراوي اتراب 250 م مربع ب 4700000...,Maison - Villa,33 19 16 13,"4,700,000 MRU"
8,,بيع نيمروين اكران افتفرغ زينة منطقة غنودرت...,Maison - Villa,22 40 71 40,"900,000 MRU"
9,,Description: \nبيع نيمروين اكران افتفرغ زينة م...,Maison - Villa,22 40 71 40,"900,000 MRU"


In [26]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 240

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(226, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df15 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df15.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
1,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
2,,سمعه فعرفات حديثه البناء تقع بين كرفور الم...,Maison - Villa,42 00 42 42,prix indéterminé
3,,دار فكرفور معاه حوش فيه بيت وامبار من بتوه...,Maison - Villa,41 30 71 24,"1,200,000 MRU"
4,,بيع نيمروين اكران فغنودرت بينهم دارين مع ط...,Maison - Villa,22 40 71 40,"1,000,000 MRU"
5,,منزل من السنگ كبير فيه اثلت ابيوت ودوش وكو...,Maison - Villa,30 67 69 65,"450,000 MRU"
6,,سمعه احذ امسيد احذ كرفور الخارجي وعل كدروه...,Maison - Villa,26 38 77 99,"8,000,000 MRU"
7,,دار فملح مبيوعه اهوينه سكتير 1 حذا كرفور ا...,Maison - Villa,36 82 01 12,"500,000 MRU"
8,,دار لكصر فاتح اعل شارع10م فات اتقشت اكروات...,Maison - Villa,20 58 02 67,"1,500,000 MRU"
9,,فاتح اعل فساحة قريب من كدروه,Maison - Villa,20 58 02 67,"800,000 MRU"


In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 255

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(241, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df16 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df16.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
1,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
2,,دار وافيه خاصها الا بينتير اف سيته لمام و...,Maison - Villa,41 87 75 49,"1,200,000 MRU"
3,,دار في دار النعيم شارع اكبير و شارع ٦ ب700...,Maison - Villa,41 87 75 49,"700,000 MRU"
4,,ابارتماهات و اباتيك عل طريق الأمل فرصة للم...,Bâtiment,27 82 81 31,"5,500,000 MRU"
5,,دار في تنسويلم فيها جلسة و صالوه و بيت ارك...,Maison - Villa,27 82 81 31,"750,000 MRU"
6,,فرصة نادرة في تنسويلم فيلا أرضية مساحتها ا...,Maison - Villa,27 82 81 31,"1,200,000 MRU"
7,,دار كبيرة للبيع المساحة 600م في الكرن على ...,Maison - Villa,31 31 30 57,prix indéterminé
8,,نيمرو افواد الناقة ماه ابعيد من كدروه في ا...,Maison - Villa,41 44 77 83,"320,000 MRU"
9,,نيمرو في واد الناقة في الكرن وفاتح اعل وسع...,Maison - Villa,41 44 77 83,"340,000 MRU"


In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 270

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(256, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df17 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df17.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
1,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
2,,للجادين فقط,Bâtiment,44 04 38 70,"8,500,000 MRU"
3,,للجادين فقط اوراقها تامين,Maison - Villa,44 04 38 70,"14,000,000 MRU"
4,,الموقع عن كرفور كدروه كرفات الرسمي وكدروه ...,Maison - Villa,44 04 38 70,"42,000,000 MRU"
5,,للجاديه هذي دار جديده ومكفله تكفال زين وعل...,Maison - Villa,44 04 38 70,"580,000 MRU"
6,,أحذبقالة طين طان,Maison - Villa,47 97 86 12,"1,200,000 MRU"
7,,دار لتنسويلم عل شارع الطب قريب من كدروه ال...,Maison - Villa,41 13 71 85,prix indéterminé
8,,سلام عليكم و رحمة الله تعالى و بركاته عندي...,Maison - Villa,37 10 10 70,"3,400,000 MRU"
9,,فاتح اعل ابديه اصغير قريه من كدروه,Maison - Villa,46 23 72 32,"1,050,000 MRU"


In [29]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the base URL of the real estate website
base_url = 'https://www.voursa.com/Index.cfm?PN={}&gct=3&gv=13'

# Set the number of pages to scrape (modify as per your requirement)
num_pages = 279

# Initialize empty lists to store the scraped data
surface = []
description = []
additional_info = []
telephone_numbers = []
prices = []

# Define the keywords to filter by
keywords = ['Maison', 'Bâtiment', 'Villa']

# Iterate over each page
for page in range(271, num_pages + 1):
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send an HTTP GET request to the website and retrieve the HTML content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the anchor tags containing the URLs
    anchors = soup.find_all('a', href=True)

    # Extract the URLs of the real estate listings
    for anchor in anchors:
        href = anchor['href']
        if href.startswith('/annonces.cfm?pdtid='):
            listing_url = 'https://www.voursa.com' + href

            # Send an HTTP GET request to the real estate URL and retrieve the HTML content
            real_estate_response = requests.get(listing_url)
            real_estate_html = real_estate_response.content

            # Create a BeautifulSoup object to parse the real estate HTML content
            real_estate_soup = BeautifulSoup(real_estate_html, 'html.parser')

            # Extract the surface and description from the real estate page
            details_category = real_estate_soup.find('div', id='details_category')
            if details_category:
                surface_element = details_category.find('span', string='Surface(m²):')
                surface_value = surface_element.find_next_sibling('span', class_='valued').text.strip() if surface_element else 'N/A'
                description_element = real_estate_soup.find('div', id='pdesc')
                description_text = description_element.text.strip() if description_element else 'N/A'

                # Clean the description text
                description_text = description_text.replace('Description: \r\n', '')

                additional_info_element = real_estate_soup.find('span', class_='StylH2')
                additional_info_text = additional_info_element.get_text(strip=True) if additional_info_element else 'N/A'

                # Check if additional info contains any of the keywords
                if any(keyword in additional_info_text for keyword in keywords):
                    telephone_element = real_estate_soup.find('div', id='nom')
                    telephone_text = telephone_element.find('div', id='vendinfo').text.strip() if telephone_element else 'N/A'

                    # Extract telephone number using regular expressions
                    telephone_number = re.search(r'Téléphones:\s+(\d+ \d+ \d+ \d+)', telephone_text)
                    telephone_number = telephone_number.group(1) if telephone_number else 'N/A'

                    price_element = real_estate_soup.find('div', id='prix')
                    price_text = price_element.text.strip() if price_element else 'N/A'

                    surface.append(surface_value)
                    description.append(description_text)
                    additional_info.append(additional_info_text)
                    telephone_numbers.append(telephone_number)
                    prices.append(price_text)
            else:
                surface_value = 'N/A'
                description_text = 'N/A'
                additional_info_text = 'N/A'
                telephone_number = 'N/A'
                price_text = 'N/A'

# Create a DataFrame from the scraped data
data = {
    'Surface': surface,
    'Description': description,
    'Additional Info': additional_info,
    'Telephone Number': telephone_numbers,
    'Price': prices
}
df18 = pd.DataFrame(data)

# Save the DataFrame to a CSV file
# df.to_csv('real_estate_data.csv', index=False, encoding='utf-8-sig')

# Print the DataFrame
df18.head(10)

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
0,,A vendre une villa neuf tres haut standing...,Maison - Villa,36 68 43 68,prix indéterminé
1,,فيلا في تفرغ زينة راقية و مؤثثة اثاث راقي ...,Maison - Villa,41 30 60 60,prix indéterminé
2,,فيلا للبيع في تفرغ زينة (أفنور) بالقرب من ...,Maison - Villa,22 36 64 31,"3,500,000 MRU"
3,,فيلا للبيع في تفرغ زينة الصحراوي أجديده لل...,Maison - Villa,22 36 64 31,"3,200,000 MRU"
4,,منزل في ابيكة 10 قريب من كدروه روصولمزيد م...,Maison - Villa,20 02 25 09,prix indéterminé
5,,Maison 12×13m2.arafat,Maison - Villa,27 65 98 20,prix indéterminé
6,,المنزل المذكور فيه صلوه وبيت اركاد وبيت او...,Maison - Villa,27 41 89 49,prix indéterminé
7,,الدار أجديده ماك اندخلت افكرن اوفاتح على ش...,Maison - Villa,37 30 00 58,"2,300,000 MRU"
8,,الدار عند كرفور اسطمبول مساحته 15 m اف 10m...,Maison - Villa,42 66 70 66,prix indéterminé
9,,علان عن ...,Maison - Villa,37 38 18 18,"250,000 MRU"


In [32]:
import pandas as pd

# Create a list to store all the dataframes
dfs = []

# Load the first dataframe named 'df'
df = globals()['df']  # Assuming you have already defined the 'df' dataframe
dfs.append(df)

# Iterate over the remaining dataframe names and load them into the list
for i in range(1, 19):
    df_name = 'df' + str(i)
    df = globals()[df_name]  # Assuming you have already defined the dataframes
    dfs.append(df)

# Concatenate all the dataframes into a single dataframe
merged_df = pd.concat(dfs)

# Remove duplicates and keep only unique values
unique_df = merged_df.drop_duplicates()

# Reset the index of the new dataframe
unique_df = unique_df.reset_index(drop=True)

# Save the dataframe to a CSV file with utf-8 encoding
unique_df.to_csv('voursa_real_estate_info.csv', index=False, encoding='utf-8-sig')


In [31]:
unique_df.describe()

Unnamed: 0,Surface,Description,Additional Info,Telephone Number,Price
count,5046.0,5046,5046,5046,5046
unique,1.0,4918,2,1168,326
top,,يمكنكم التواصل معنا عبر الواتساب علي الرقم...,Maison - Villa,31 06 45 08,prix indéterminé
freq,5046.0,21,4573,518,539
