In [1]:
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scraping the first page

In [2]:
BASE_URL = "https://www.bamako-immobilier.com/"
url = "https://www.bamako-immobilier.com/Location_Appartements.html"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were results.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, "html.parser")

# Main pages: collect the result pages that contain the list of ads/annonces.
# When navigating the site we click on "next page"; this code grabs the links
# to all of those pages.
t = soup.find(id="searchbar", class_="row")
# If the element is absent, fall back to the hard-coded first page only instead
# of failing with an AttributeError on None.
hrefs = t.find_all(href=True) if t is not None else []
# urljoin resolves relative, root-relative and absolute hrefs against the site root.
m_pages = [urljoin(BASE_URL, x.get("href")) for x in hrefs]

m_first_page = "https://www.bamako-immobilier.com/index.php?cur_page=0&&sortby=Prix&sorttype=ASC&type=Location&bien=Appartements&action=searchresults"
m_pages.append(m_first_page)

# Drop repeated links (order preserved) so that no result page is scraped twice.
m_pages = list(dict.fromkeys(m_pages))

## Scraping - pages containing the list of ads/annonces

In [3]:
# From the main pages: get each annonce page title & link.
def annonces_titles_links(main_pages, base_url="https://www.bamako-immobilier.com/", timeout=30):
    """Collect the title and the absolute link of every annonce listed on the given pages.

    Parameters
    ----------
    main_pages : iterable of str
        URLs of the result pages to download.
    base_url : str, optional
        URL against which the scraped ``href`` values are resolved.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get`` for each page.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` (text of every element with class ``line mt1``) and
        ``link`` (resolved ``href`` of every element with class ``icon-rate``),
        accumulated over all pages in the order they were visited.

    Raises
    ------
    requests.HTTPError
        If a page answers with an HTTP error status.
    ValueError
        Raised by pandas when the number of titles and links collected differ.
    """
    all_titles = []
    all_links = []

    for page_url in main_pages:
        page = requests.get(page_url, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, "html.parser")

        # annonces titles
        all_titles.extend(tag.get_text() for tag in soup.find_all(class_="line mt1"))

        # annonces links: urljoin avoids the doubled slash that plain string
        # concatenation produced and also copes with root-relative/absolute hrefs.
        # Tags without an href are skipped (urljoin would otherwise silently
        # return the bare base URL for them).
        all_links.extend(
            urljoin(base_url, tag.get("href"))
            for tag in soup.find_all(class_="icon-rate")
            if tag.get("href")
        )

    return pd.DataFrame({"title": all_titles, "link": all_links})

## Scraping - main page of each ad/annonce

In [55]:
# From an annonce page: get details (sub_title, description, ville, prix,
# nombre de chambres, nombre de pieces, loyer mensuel).
def _value_after_label(tokens, label, prefix=":", blank_as_nan=False):
    """Return the token that follows the first occurrence of ``label`` in ``tokens``.

    ``prefix`` is removed from the value with ``str.replace``. ``np.nan`` is
    returned when the label is absent, when it is the last token (no value
    follows it), or - if ``blank_as_nan`` is set - when the value is ``""`` or ``" "``.
    """
    try:
        position = tokens.index(label)
    except ValueError:
        return np.nan
    if position + 1 >= len(tokens):
        return np.nan
    raw_value = tokens[position + 1]
    if blank_as_nan and raw_value in (" ", ""):
        return np.nan
    return raw_value.replace(prefix, "")


def _parse_annonce(soup):
    """Extract one record (dict of detail fields) from a parsed annonce page."""
    record = {
        "sub_title": np.nan,
        "description": np.nan,
        "ville": np.nan,
        "prix": np.nan,
        "num_chambres": np.nan,
        "num_pieces": np.nan,
        "loyer_mensuel": np.nan,
    }

    container = soup.find(class_="zonegris mt2")
    if container is None:
        # Page without the expected container: keep an all-NaN row so the
        # record still lines up with its link.
        return record

    # Sub-title: text of the second child of the <h2> element.
    heading = container.h2
    if heading is not None:
        heading_parts = [child.get_text() for child in heading]
        if len(heading_parts) > 1:
            record["sub_title"] = heading_parts[1]

    # Description: all non-empty child texts are merged into ONE value, so an
    # annonce can never contribute zero or several description entries.
    desc_block = container.find(class_="row mt1")
    if desc_block is not None:
        pieces = [child.get_text(strip=True) for child in desc_block]
        pieces = [piece.replace("Descriptif: ", "") for piece in pieces if len(piece) > 0]
        if pieces:
            record["description"] = " ".join(pieces)

    # Flatten the info grid into a list of "label", "value", "label", ... tokens.
    tokens = []
    info_block = container.find(class_="row grid2")
    if info_block is not None:
        for child in info_block:
            tokens.extend(child.get_text(strip=True, separator="\n").splitlines())

    record["ville"] = _value_after_label(tokens, "Ville", prefix=": ")
    record["num_chambres"] = _value_after_label(tokens, "Nombre de chambres")
    record["num_pieces"] = _value_after_label(tokens, "Nombre de pièces")
    record["loyer_mensuel"] = _value_after_label(tokens, "Loyer mensuel")
    record["prix"] = _value_after_label(tokens, "Prix", blank_as_nan=True)
    return record


def annonce_details(annonce_links, timeout=30):
    """Download every annonce page and return its details as one row per link.

    Parameters
    ----------
    annonce_links : iterable of str
        URLs of the annonce pages (a list or a pandas Series both work).
    timeout : float, optional
        Timeout in seconds passed to ``requests.get`` for each page.

    Returns
    -------
    pandas.DataFrame
        Columns ``sub_title``, ``description``, ``ville``, ``prix``,
        ``num_chambres``, ``num_pieces``, ``loyer_mensuel`` and ``link``.
        Fields that cannot be found on a page are ``np.nan``. When a label
        occurs more than once on a page, the first occurrence is used.

    Raises
    ------
    requests.HTTPError
        If a page answers with an HTTP error status.
    """
    # Materialise once: iterating positionally is safe whatever index a Series
    # carries, and the same list is reused for the "link" column.
    links = list(annonce_links)

    records = []
    for url in links:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, "html.parser")
        records.append(_parse_annonce(soup))

    # recap
    detail_columns = [
        "sub_title",
        "description",
        "ville",
        "prix",
        "num_chambres",
        "num_pieces",
        "loyer_mensuel",
    ]
    df = pd.DataFrame(records, columns=detail_columns)
    df["link"] = links

    return df


In [58]:
# Get all the data & save it as a csv file
def scrape_all(main_pages, output_path="scrapping_bko_immo.csv"):
    """Scrape every annonce reachable from ``main_pages`` and save the result as CSV.

    Parameters
    ----------
    main_pages : list of str
        URLs of the result pages, passed to ``annonces_titles_links``.
    output_path : str, optional
        Destination of the CSV file; defaults to the file name this notebook
        has always written.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``annonce_details``, i.e. the data written to disk.
    """
    main_links = annonces_titles_links(main_pages)["link"]
    df = annonce_details(main_links)
    df.to_csv(output_path)
    return df

# Keep the result in a variable: it stays available for inspection and the
# cell does not echo the whole frame as its output.
scraped_annonces = scrape_all(m_pages)