In [1]:
!pip install requests

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install bs4

Defaulting to user installation because normal site-packages is not writeable


In [52]:
import csv
import os
import time
import requests
import numpy as np
from bs4 import BeautifulSoup

In [4]:
url =' https://www.buyrentkenya.com/houses-for-sale'
# set up the webscrapping agent for chrome
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
# headers dictionary to send with the GET request
HEADERS = {'User-Agent': agent, 'Accept-Language': 'en-US, en'}

In [5]:
# get the data
data = requests.get(url, headers=HEADERS)

In [6]:
# creating a soup object
soup = BeautifulSoup(data.text, 'html.parser')

In [7]:
# Finding houses data in the div
# because class is a reserved keyword in Python
results = soup.find_all('div', class_='listing_card')

In [8]:
# lists to store data in
titles = []
prices = []
descriptions = []
locations = []
no_of_bedrooms = []
no_of_bathrooms = []


In [9]:
# find house details
houses = soup.find_all('div', class_='listing-card')

In [10]:
# loop through first page houses
for page in range(1,188):

    for house in houses:
        title = house.find('span', class_='relative top-[2px] hidden md:inline').text.strip()
        price = house.find('p', class_='text-xl font-bold leading-7 text-grey-900').text.strip()
        description = house.find('a',class_='block truncate text-grey-500 no-underline').text.strip()
        location = house.find('p', class_='ml-1 truncate text-sm font-normal capitalize text-grey-650').text.strip()
        no_of_bedroom = house.find_all('span', class_='whitespace-nowrap font-normal')[0].text.strip()
        no_of_bathroom = house.find_all('span', class_='whitespace-nowrap font-normal')[1].text.strip()

        titles.append(title)
        locations.append(location)
        descriptions.append(description)
        prices.append(price)
        no_of_bedrooms.append(no_of_bedroom)
        no_of_bathrooms.append(no_of_bathroom)
        

In [11]:
# check whether all the data was scraped successfully
len(titles) == len(locations) == len(descriptions) == len(prices) == len(no_of_bedrooms) == len(no_of_bathrooms)

True

In [12]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [13]:
# create a dataset and store the scraped data
import pandas as pd
df = pd.DataFrame(
    {'Title': titles,
     'Location': locations,
     'Description': descriptions,
     'Price': prices,
     'No_of_bedrooms': no_of_bedrooms,
     'No_of_bathrooms': no_of_bathrooms}
)

In [14]:
df.head()

Unnamed: 0,Title,Location,Description,Price,No_of_bedrooms,No_of_bathrooms
0,4 Bed Townhouse with En Suite at Mount Kenya Road,"Mount Kenya road, Nyali Area, Nyali",Beautiful 4 bedroom three level townhouse onsa...,"KSh 65,000,000",4,5
1,4 Bed Townhouse at Phase 2,"Phase 2, Buruburu",Upgraded 4 Bedroom House With SQ,"KSh 13,000,000",4,2
2,4 Bed House with En Suite at Kitisuru,"kitisuru, Kitisuru, Westlands","Welcome to Zada residence, 4 bedroom","KSh 45,000,000",4,4
3,6 Bed Townhouse with En Suite in Lavington,Lavington,Modern 6 Bedroom Townhouse for Sale in Lavingt...,"KSh 150,000,000",6,6
4,4 Bed Townhouse with En Suite in Loresho,"Loresho, Westlands",4 Bedrooms Townhouse,"KSh 145,000,000",4,5


In [15]:
df.shape

(3366, 6)

In [16]:
df.to_csv('rents.csv', index=False, encoding='utf-8')

In [17]:
# find duplicates
df_clean = df.drop_duplicates()

In [18]:
df_clean.shape

(18, 6)

In [19]:
df.shape

(3366, 6)

### Data Cleaning

In [20]:
# Duplicates(find and drop)

In [21]:
# Find nulls
df.isna().sum()

Title              0
Location           0
Description        0
Price              0
No_of_bedrooms     0
No_of_bathrooms    0
dtype: int64

In [22]:
# Remove Ksh prefix, commas and convert to int
# df['Price'] = df['Price'].astype('string')
df['Price'] = pd.to_numeric(df['Price'].str.replace('KSh', '').str.replace('Ksh', '').str.replace(',', '').str.strip())

In [23]:
df['No_of_bathrooms'] = pd.to_numeric(df['No_of_bathrooms'])
df['No_of_bedrooms'] = pd.to_numeric(df['No_of_bedrooms'])

In [24]:
# Finding null values in no. of bathrooms column
df['No_of_bathrooms'].isna().sum()


0

In [25]:
# Replace NA in No of bathrooms with 0
df['No_of_bathrooms'] = df['No_of_bathrooms'].fillna(0)

In [26]:
# Convert type to int
df['No_of_bathrooms'] = df['No_of_bathrooms'].astype(int)

In [27]:
# find data types
df.dtypes

Title              object
Location           object
Description        object
Price               int64
No_of_bedrooms      int64
No_of_bathrooms     int32
dtype: object

In [28]:
df_clean = df

In [29]:
df['Location'] = df['Location'].str.split(',').str[-1].str.strip()

In [30]:
df.head()

Unnamed: 0,Title,Location,Description,Price,No_of_bedrooms,No_of_bathrooms
0,4 Bed Townhouse with En Suite at Mount Kenya Road,Nyali,Beautiful 4 bedroom three level townhouse onsa...,65000000,4,5
1,4 Bed Townhouse at Phase 2,Buruburu,Upgraded 4 Bedroom House With SQ,13000000,4,2
2,4 Bed House with En Suite at Kitisuru,Westlands,"Welcome to Zada residence, 4 bedroom",45000000,4,4
3,6 Bed Townhouse with En Suite in Lavington,Lavington,Modern 6 Bedroom Townhouse for Sale in Lavingt...,150000000,6,6
4,4 Bed Townhouse with En Suite in Loresho,Westlands,4 Bedrooms Townhouse,145000000,4,5


In [31]:
# df.duplicated().sum()

3348

### EDA

In [32]:
print(df['No_of_bedrooms'].corr(df['Price']))

0.41707945903682764


In [33]:
print(df['Price'].corr(df['No_of_bathrooms']))

0.4920303821513709


In [34]:
print(df['No_of_bedrooms'].corr(df['No_of_bathrooms']))

0.7064865262788574


### Linear regression

In [47]:
pip install scikit-learn

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\ADMIN\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\scipy\\stats\\_ksstats.py'
Check the permissions.



Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.6 kB ? 

In [48]:
# import linear_model from scikit-learn
from sklearn.linear_model import LinearRegression

In [49]:
# create LR Object using the constructor
lm = LinearRegression()

In [56]:
# define predictor and target variables
x = df[['No_of_bedrooms']]
y = df[['Price']]
z = df[['No_of_bedrooms', 'No_of_bathrooms']]

In [67]:
# fit the model - get the parameters of intercept(b0) and slope(bi)
lm.fit(x, y)

In [68]:
lm.intercept_

array([-1.77627411e+08])

In [69]:
lm.coef_

array([[64321827.41116714]])

In [70]:
# obtaining predictions
Yhat = lm.predict(x)

In [71]:
Yhat

array([[7.96598985e+07],
       [7.96598985e+07],
       [7.96598985e+07],
       ...,
       [7.96598985e+07],
       [1.43981726e+08],
       [1.53380711e+07]])

In [72]:
r_sq = lm.score(x,y)
print(r_sq)

0.17395527515045273
