# Scrape data

### import necessary libraries

In [1]:
from bs4 import BeautifulSoup, Tag
import requests
import re
import pandas as pd
from tqdm import tqdm

### scrape the first page

In [None]:
# function to extract html document from given url
def getHTMLdocument(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without a timeout a stalled connection would
            block the whole scraping run indefinitely.

    Returns:
        The decoded response body (``response.text``). The HTTP status
        code is not checked, so error pages are returned as-is.
    """
    # send a browser-like user agent instead of the default requests one
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
    }

    # request the HTML document of the given url
    response = requests.get(url, headers=headers, timeout=timeout)

    # the body is returned as decoded text (HTML markup for the scraped pages)
    return response.text
    
# Fetch and parse the first page of the listing index.
# Base URL of the listing index; page URLs are built from it later on.
url_to_scrape = "https://selogeraumali.com/annonces-immobilieres"
  
# download the raw HTML text of the first listing page
html_document = getHTMLdocument(url_to_scrape)
  
# create the soup object (parsed HTML tree) using Python's built-in parser
soup = BeautifulSoup(html_document, 'html.parser')

In [None]:
# Render the parsed first page as indented markup to inspect its structure;
# the returned string is shown as the cell output.
soup.prettify()

### get the number of pages

In [None]:
# Default to a single page: no pagination links are rendered in that case.
number_of_pages = 1

# Every pagination anchor on the first page carries the "page-numbers" class.
page_numbers = soup.find_all("a", class_="page-numbers")
if len(page_numbers) > 1:
    # The second-to-last anchor is read as the highest page number
    # (presumably the last anchor is the "next" link -- confirm against the site).
    last_page = page_numbers[-2]
    number_of_pages = int(last_page.get_text())

### get the links for all pages

In [None]:
# Collect the detail-page URL of every listing across all result pages.
property_links = []
for page_number in tqdm(range(1, number_of_pages + 1)):
    document = getHTMLdocument(url_to_scrape + f"/page/{page_number}")
    # Parse the page that was just downloaded, not the first-page
    # `html_document`, otherwise every iteration yields page 1's links.
    page_soup = BeautifulSoup(document, 'html.parser')
    property_titles = page_soup.find_all("h2", class_="property-title")
    # Each listing title is an <h2> whose child anchor holds the detail URL;
    # non-Tag children (plain text nodes) are skipped.
    property_links.extend(
        a["href"] for h2 in property_titles for a in h2.children
        if isinstance(a, Tag) and a.has_attr("href")
    )

In [None]:
# Display the collected listing URLs for a quick visual check.
property_links

### Iterate through every link and get the property details

In [None]:
# Separator placed between the paragraphs of a listing description.
description_text_sep = "|"
# Prefixes used for the dynamically named record keys.
location_field_prefix = "location"
info_field_prefix = "info"
agent_field_prefix = "agent"  # not used by the loop below
# One dict per scraped listing; keys vary with what each page exposes.
property_records = []


def _labelled_values(container, prefix):
    """Map each <strong> label in *container* to the text of its sibling <span>.

    Keys are built as ``"<prefix>-<label text>"``; labels without a following
    <span> sibling are skipped.
    """
    values = {}
    for label in container.find_all("strong"):
        value = label.find_next_sibling("span")
        if value:
            values[f"{prefix}-{label.get_text()}"] = value.get_text()
    return values


for property_link in tqdm(property_links):
    # The record is appended before it is filled, so a partially populated
    # entry stays in the list if parsing fails midway.
    record = {}
    property_records.append(record)
    record["source"] = url_to_scrape

    document = getHTMLdocument(property_link)
    property_soup = BeautifulSoup(document, 'html.parser')

    # find property ad title
    page_title = property_soup.find("h1")
    if page_title:
        record["title"] = page_title.get_text()

    # find property status
    property_status_div = property_soup.find("div", class_="property-status")
    if property_status_div:
        property_status = property_status_div.find("span")
        if property_status:
            record["status"] = property_status.get_text()

    # find price
    property_price_div = property_soup.find("div", class_="property-price")
    if property_price_div:
        property_price = property_price_div.find("span")
        if property_price:
            record["price"] = property_price.get_text()

    # find description: paragraphs of the first description block, joined
    property_description = property_soup.find("div", class_="property-description")
    if property_description:
        description_parts = property_description.find_all("p")
        record["description"] = description_text_sep.join(
            part.get_text() for part in description_parts
        )

    # find location details (label/value pairs of the first location block)
    property_location = property_soup.find(
        "div", class_="single-property-element property-location"
    )
    if property_location:
        record.update(_labelled_values(property_location, location_field_prefix))

    # find additional info (label/value pairs of the overview block)
    property_info = property_soup.find("div", id="ere-overview")
    if property_info:
        record.update(_labelled_values(property_info, info_field_prefix))

    # features are listed as anchors; each one present becomes a True flag
    property_info = property_soup.find("div", id="ere-features")
    if property_info:
        for field in property_info.find_all("a"):
            record[f"{info_field_prefix}-{field.get_text()}"] = True

    # find agent info
    property_agent_heading = property_soup.find("div", class_="agent-heading")
    if property_agent_heading:
        agent_name = property_agent_heading.find("a")
        if agent_name:
            record["agent_name"] = agent_name.get_text()
        agent_type = property_agent_heading.find("span")
        if agent_type:
            record["agent_type"] = agent_type.get_text()

    property_agent_mobile = property_soup.find("div", class_="agent-mobile")
    if property_agent_mobile:
        agent_mobile = property_agent_mobile.find("span")
        if agent_mobile:
            record["agent_mobile"] = agent_mobile.get_text()

    property_agent_email = property_soup.find("div", class_="agent-email")
    if property_agent_email:
        agent_email = property_agent_email.find("span")
        if agent_email:
            # Read the e-mail span itself; reading the mobile span here would
            # store the phone number (or a stale/missing tag) as the e-mail.
            record["agent_email"] = agent_email.get_text()

    # find ratings info
    property_ratings_average = property_soup.find("span", class_="ratings-average")
    if property_ratings_average:
        record["ratings_average"] = property_ratings_average.get_text()

    property_ratings_count = property_soup.find("span", class_="ratings-count")
    if property_ratings_count:
        record["ratings_count"] = property_ratings_count.get_text()

    property_ratings_overall_rating = property_soup.find("div", class_="overall-rating")
    if property_ratings_overall_rating:
        for ratings_detail in property_ratings_overall_rating.find_all("li"):
            ratings_detail_label = ratings_detail.find("span", class_="label")
            if ratings_detail_label:
                # NOTE(review): the value span is looked up with the same
                # "label" class as the label itself -- confirm against the
                # page markup that this is the intended selector.
                ratings_detail_pct = ratings_detail_label.find_next_sibling(
                    "span", class_="label"
                )
                if ratings_detail_pct:
                    record[
                        f"ratings_{ratings_detail_label.get_text()}"
                    ] = ratings_detail_pct.get_text()

    property_date = property_soup.find("span", class_="property-date")
    if property_date:
        record["property_date"] = property_date.get_text()

    # Reviews are not stored in the record; listings that have any are only
    # printed so their markup can be inspected.
    property_reviews_ul = property_soup.find("ul", class_="reviews-list")
    if property_reviews_ul:
        property_reviews = property_reviews_ul.find_all("li")
        if property_reviews:
            print(property_link)
            print(property_reviews)

In [None]:
# Display the scraped records (one dict per listing) for inspection.
property_records

### Create pandas dataframe from records and save data as csv

In [None]:
# Build a table from the list of record dicts: one row per listing, one
# column per distinct key; keys missing from a record become NaN in its row.
property_df = pd.DataFrame.from_records(property_records)

In [None]:
# Save the scraped table without the row index. `index` is documented as a
# bool, so pass False explicitly rather than relying on None being falsy.
property_df.to_csv("properties.csv", index=False)

# Process data

In [2]:
# Reload the scraped data from disk so processing can start without re-scraping.
# NOTE(review): the displayed table has an "Unnamed: 0" column, which suggests
# the file on disk was written with its row index -- confirm and drop if unwanted.
property_df = pd.read_csv("properties.csv")

In [3]:
# Display the loaded table.
property_df

Unnamed: 0,source,title,status,price,description,location-Adresse:,location-Pays:,location-Ville / Région:,location-Quartier / Cercle:,location-Voisinage:,...,info- Électricité,info-Jardin,info- Bibliothèque,info- Buffets,info- Chaines TV numérique,info- Coiffeuse,info- Cuisine entièrement équipée,info- Entièrement meublé,info-Piscine,info- piscine
0,https://selogeraumali.com/annonces-immobilieres,Villa duplex à louer à Sotuba aci,Location,Prix sur demande,A louer une villa duplex à Sotuba aci Bamako p...,Sotuba,Mali,Bamako,Sotuba Bamako,Sotuba ACI Bamako,...,,,,,,,,,,
1,https://selogeraumali.com/annonces-immobilieres,Appartement f3 meublé d’exception à louer l’ac...,Location,650.000 Fcfa,A louer : appartement f3 meublé d’exception à ...,Aci 2000,Mali,Bamako,ACI 2000 Bamako,Radisson Blu Hotel Bamako,...,,,,,,,,,,
2,https://selogeraumali.com/annonces-immobilieres,Villa à vendre à Sirakoro Cité BMS,Location,65.000.000 Fcfa / Par mois,MAISON À VENDRE A SIRAKORO CITÉ BMS.|3 CHAMBRE...,Sirakoro,Mali,Bamako,Zirakoro Bamako,,...,True,,,,,,,,,
3,https://selogeraumali.com/annonces-immobilieres,Villa à louer à Yirimadio Cité Tellem,Location,350.000 Fcfa / Par mois,A louer Villa à Yirimadio Cité Tellem pas chèr...,Yirimadio,Mali,Bamako,Yirimadio Bamako,Yirimadio Cité Tellem Bamako,...,,,,,,,,,,
4,https://selogeraumali.com/annonces-immobilieres,Villa meublée à louer à Sébénikoro,Location,1.000.000 Fcfa / Par mois,Villa meublée à louer à Sébénikoro Bamako pour...,Sébénikoro,Mali,Bamako,Sébénikoro Bamako,Dorodougou Bamako,...,,1.0,True,True,True,True,True,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,https://selogeraumali.com/annonces-immobilieres,Villa à louer à Bamako ACI 2000,Location,1.200.000 Fcfa / Par mois,Location jolie villa luxueuse à Bamako ACI 200...,ACI 2000,Mali,Bamako,ACI 2000 Bamako,Palais des sports Bamako,...,True,1.0,,,,,,,,
1096,https://selogeraumali.com/annonces-immobilieres,Villa à louer à Badalabougou,Location,600.000 Fcfa / Par mois,A louer non meublée une belle villa à Badalabo...,Badalabougou,Mali,Bamako,Badalabougou Bamako,Badalabougou SEMA Bamako,...,,1.0,,,,,,,,
1097,https://selogeraumali.com/annonces-immobilieres,Terrain 15/20 Avec Chantier En Cours à vendre ...,Vente,A partir de 5.000.000 Fcfa,A vendre terrain 15/20 avec Chantier En Cours ...,Souban Extension,Mali,Koulikoro,Souban Koulikoro,,...,,,,,,,,,,
1098,https://selogeraumali.com/annonces-immobilieres,Villa avec piscine à louer à la Cité du Niger,Location,1.700.000 Fcfa / Par mois,A louer villa luxueuse avec piscine non meublé...,Cité du Niger,Mali,Bamako,Cité du Niger Bamako,Cité du Niger au bord Fleuve Niger Bamako,...,True,1.0,,,,,,,1.0,True


# Check data quality

# Store data

# Transform data

# Visualize data