In [1]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from requests import get
import numpy as np
import time
from time import sleep

#### Beautiful Soup will help us read this html. It picks the text from the response and parses the information in a way that makes it easier for us to navigate in its structure and get its contents.
#### The requests module allows you to send HTTP requests using Python

In [2]:
# Assigning the url(Uniform Resource Locator) of website from which data needs to be scarpped.
base_url = 'https://www.propertypro.ng/property-for-rent/in/lagos'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

#### Now, let's test if we can communicate with the website. You can get several codes from this command, but if we get '200' it’s usually a sign that we’re good to go.

In [3]:
#  Requesting the data  from url.
r = requests.get(base_url, headers=headers)
r.status_code

200

In [4]:
soup = BeautifulSoup(r.text, 'html.parser')
print(soup.title.get_text())

 Property & Houses for rent  in Lagos   (26,638 listings) | PropertyPro.ng


In [5]:
house_containers = soup.select('div', class_="single-room-text")

In [6]:
titles = soup.select('.single-room-text > a > h2')

In [7]:
len(titles)

22

In [8]:
# Creating empty lists in-order to append data scrapped from url's.
# The number of lists depends on the number of features we are extracting from the url.

title=[]
location=[]
price=[]
Amenities=[]
toilet=[]
bathroom=[]
bedroom=[]

In [9]:
titles_list = []
for title in titles:
    title = title.get_text()
    title = title.split()
    title = title[0:3]
    title = ' '.join(title)
    titles_list.append(title)
titles_list

['4 BEDROOM FLAT',
 '2 BEDROOM FLAT',
 '3 BEDROOM FLAT',
 '4 BEDROOM FLAT',
 '5 BEDROOM HOUSE',
 '4 BEDROOM TERRACED',
 'OFFICE SPACE COMMERCIAL',
 '4 BEDROOM TERRACED',
 '4 BEDROOM SEMI',
 '3 BEDROOM FLAT',
 '5 BEDROOM DETACHED',
 '3 BEDROOM TERRACED',
 '3 BEDROOM FLAT',
 '2 BEDROOM FLAT',
 '3 BEDROOM FLAT',
 'SEMI DETACHED DUPLEX',
 '3 BEDROOM FLAT',
 '3 BEDROOM FLAT',
 '4 BEDROOM TERRACED',
 '3 BEDROOM FLAT',
 '4 BEDROOM FLAT',
 '2 BEDROOM FLAT']

In [10]:
# find all prices
prices = soup.select('.n50 > h3 > span')

In [11]:
# fixing all prices
prices_list = []
for i, price in enumerate(prices):
    price = price.get_text()
    if len(price)> 2:
        price = price.replace(',','')
        try :
            price = int(price)
        except ValueError:
            price = str(price)    
        prices_list.append(price)

In [12]:
len(prices)

44

In [13]:
prices_list

['14999999/year',
 12000000,
 16000000,
 20000000,
 14999999,
 10000000,
 45000,
 4500000,
 4500000,
 3500000,
 110000000,
 3700000,
 2800000,
 13000000,
 10000000,
 3500000,
 9000000,
 2500000,
 8000000,
 21500000,
 18000000,
 12000000]

In [14]:
locations = soup.select('.single-room-text > h4')

In [15]:
len(locations)

22

In [16]:
# find all locations
locations_list = []
for i in locations:
    loc = i.get_text().split()
    loc = loc[-2]
    loc = loc.replace(',', '')
    locations_list.append(loc)
locations_list

['Ikoyi',
 'Ikoyi',
 'Ikoyi',
 'Ikoyi',
 'Ikoyi',
 'Island',
 'Island',
 'Lekki',
 'Lekki',
 'Lekki',
 'Lekki',
 'Lekki',
 'Lekki',
 'Ikoyi',
 'Ikoyi',
 'Lekki',
 'Ikoyi',
 'Surulere',
 'Ikoyi',
 'Island',
 'Ikoyi',
 'Island']

In [17]:
Amenities = soup.select('.fur-areea > span')

In [18]:
len(Amenities)

66

In [19]:
Amenities[3].text

'2 beds'

In [20]:
# Let's write a function to fix all the amenities into a list

toilet_list = []
bathroom_list = []
bedroom_list = []

for i, a in enumerate(Amenities):
    if i in range(2,66,3):
        a = a.get_text().split()
        toilet = a[0]
        toilet_list.append(toilet)
    elif i in range(1,66,3):
        a = a.get_text().split()
        bathroom = a[0]
        bathroom_list.append(bathroom)
    elif i in range (0,66,3):
        a = a.get_text().split()
        bedroom = a[0]
        bedroom_list.append(bedroom)

In [21]:
toilet_list

['5',
 '3',
 '4',
 '5',
 '6',
 '6',
 'Toilets',
 '5',
 '5',
 '4',
 '6',
 '4',
 '4',
 '3',
 '4',
 'Toilets',
 '4',
 '4',
 '5',
 '4',
 '5',
 '3']

In [22]:
len(toilet_list)

22

In [23]:
bathroom_list

['4',
 '2',
 '3',
 '4',
 '5',
 '5',
 'baths',
 '4',
 '4',
 '3',
 '5',
 '3',
 '3',
 '2',
 '3',
 'baths',
 '3',
 '4',
 '4',
 '3',
 '4',
 '2']

In [24]:
len(bathroom_list)

22

In [25]:
bedroom_list

['4',
 '2',
 '3',
 '4',
 '5',
 '4',
 'beds',
 '4',
 '4',
 '3',
 '5',
 '3',
 '3',
 '2',
 '3',
 'beds',
 '3',
 '3',
 '4',
 '3',
 '4',
 '2']

In [26]:
len(bedroom_list)

22

In [27]:
#creating a function to be able to return all list generated

def scrape_page(url):
    req = requests.get (base_url, headers=headers)
    soup = BeautifulSoup(r.text,'html.parser')
    
    titles = soup.select('.single-room-text > a > h2')
    titles_list = []
    for title in titles:
        title = title.get_text()
    title = title.split()
    title = title[0:3]
    title = ' '.join(title)
    titles_list.append(title)

    prices = soup.select('.n50 > h3 > span')
    prices_list = []
    for i, price in enumerate(prices):
        price = price.get_text()
    if len(price)> 2:
        price = price.replace(',','')
        try :
            price = int(price)
        except ValueError:
            price = str(price)    
        prices_list.append(price)

    locations = soup.select('.single-room-text > h4')
    locations_list = []
    for i in locations:
        loc = i.get_text().split()
    loc = loc[-2]
    loc = loc.replace(',', '')
    locations_list.append(loc)

    Amenities = soup.select('.fur-areea > span')
    toilet_list = []
    bathroom_list = []
    bedroom_list = []

    for i, a in enumerate(Amenities):
        if i in range(2,66,3):
            a = a.get_text().split()
            toilet = a[0]
            toilet_list.append(toilet)
        elif i in range(1,66,3):
            a = a.get_text().split()
            bathroom = a[0]
            bathroom_list.append(bathroom)
        elif i in range (0,66,3):
            a = a.get_text().split()
            bedroom = a[0]
            bedroom_list.append(bedroom)
    
    return(titles_list, prices_list, locations_list, toilet_list, bathroom_list, bedroom_list)

In [28]:
base_url = 'https://www.propertypro.ng/property-for-rent/in/lagos?page=1'
urls= []
for i in range(1,400):
    url = base_url + '?page=' + str(i)
    urls.append(url)

In [29]:
urls[4]

'https://www.propertypro.ng/property-for-rent/in/lagos?page=1?page=5'

In [30]:
master_titles = []
master_prices = []
master_locations = []
master_toilet = []
master_bathroom = []
master_bedroom = []

for url in urls:
    titles_list, prices_list, locations_list, toilet_list, bathroom_list, bedroom_list = scrape_page(url)
    
    master_titles += titles_list
    master_prices += prices_list
    master_locations += locations_list
    master_toilet += toilet_list
    master_bathroom += bathroom_list
    master_bedroom += bedroom_list

In [34]:
len(master_titles)

399

In [32]:
len(master_prices)

399

In [49]:
len(master_locations)

430

In [50]:
len(master_toilet)

9460

In [51]:
len(master_bathroom)

9460

In [52]:
len(master_bedroom)

9460

In [21]:
# Defining a dataframe.
eazy_rent = pd.DataFrame()

In [22]:
# Returns data in form a dataframe with columns of specified names containing the assigned values from the list.
df['title']=title 
df['location']=loc
df['price']=price
df['Amenities_list']=Amenities_list

In [24]:
property=df.to_csv('eazyrent.csv')

In [26]:
# Reading data from CSV file.
property1=pd.read_csv('eazyrent.csv')

In [27]:
property1.shape

(66, 5)