In [1]:
#Importing libraries

import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [2]:
#Initiating the web scraper for the url
#The request is given a timeout so a stalled connection cannot hang the notebook,
#and raise_for_status() stops the run on an HTTP error status instead of letting
#BeautifulSoup parse an error page that contains no listings.

url = "https://web.archive.org/web/20210805194237/https://losangeles.craigslist.org/d/apartments-housing-for-rent/search/apa"
page = requests.get(url, timeout=30)
page.raise_for_status()
parser = BeautifulSoup(page.content, "html.parser")

In [3]:
#Getting all the rental price data into a dataframe
#The text of each "result-price" element is extracted explicitly so the "Rent"
#column holds plain strings such as "$2,199", rather than relying on pandas to
#unpack BeautifulSoup Tag objects as if they were nested lists.

price_tags = parser.find_all(class_="result-price")
price = pd.DataFrame({"Rent": [tag.get_text(strip=True) for tag in price_tags]})

In [4]:
#There are duplicates for each rental price

price.head(10)

Unnamed: 0,Rent
0,"$2,199"
1,"$2,199"
2,"$6,315"
3,"$6,315"
4,"$4,695"
5,"$4,695"
6,"$1,615"
7,"$1,615"
8,"$1,695"
9,"$1,695"


In [5]:
#Removing duplicates
#Keeps the rows at positions 1, 3, 5, ... (one row out of every consecutive pair)
#and renumbers the remaining rows from 0.
#Note: price is rebuilt from itself, so running this cell again halves it again.

price = price.iloc[1::2].reset_index(drop=True)

In [6]:
#Rental prices with duplicates removed

price.head(10)

Unnamed: 0,Rent
0,"$2,199"
1,"$6,315"
2,"$4,695"
3,"$1,615"
4,"$1,695"
5,"$2,095"
6,"$1,922"
7,"$2,350"
8,"$1,325"
9,"$1,495"


In [7]:
#The data type of the elements is string

type(price.iloc[0,0])

str

In [8]:
#Converting the data type to float
#regex=False makes both replacements literal. "$" is a regular-expression
#metacharacter (end-of-string anchor), and the default of the regex argument
#differs between pandas versions, so the literal match is requested explicitly.

price["Rent"] = (
    price["Rent"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
)
price = price.astype(float)

In [9]:
#Rental prices as float

price.head(10)

Unnamed: 0,Rent
0,2199.0
1,6315.0
2,4695.0
3,1615.0
4,1695.0
5,2095.0
6,1922.0
7,2350.0
8,1325.0
9,1495.0


In [10]:
#Getting all the property size data into a dataframe
#Each element matching class "housing" becomes one row of the single "Size" column.
#dtype=object asks pandas to store the scraped elements as Python objects; the
#type check in a later cell reports them as bs4.element.Tag.
#NOTE(review): the stray "values = np.array([convert(v) for v in values])" line in
#this cell's output looks like the tail of a warning raised by this statement --
#presumably caused by the elements having different numbers of children; confirm
#on a fresh run.

size = pd.DataFrame(parser.find_all(class_="housing"), columns=["Size"], dtype=object)

  values = np.array([convert(v) for v in values])


In [11]:
#The data shows bedrooms and square feet in HTML format

size.head(10)

Unnamed: 0,Size
0,[\n 2br -\n ...
1,[\n 2br -\n ...
2,[\n 1br -\n ...
3,"[\n 575ft, [2], -\n ..."
4,[\n 1br -\n ]
5,[\n 1br -\n ...
6,"[\n 489ft, [2], -\n ..."
7,[\n 2br -\n ]
8,"[\n 500ft, [2], -\n ..."
9,"[\n 550ft, [2], -\n ..."


In [12]:
#The data type of the elements is an HTML element tag

type(size.iloc[0,0])

bs4.element.Tag

In [13]:
#Converting the data type to string
#Every cell is passed through str(), which turns each element into its HTML
#markup text (see the examples displayed in the following cells).

size = size.astype(str)

In [14]:
#Example of data point with both bedrooms and square feet

size.iloc[0, 0]

'<span class="housing">\n                    2br -\n                    1000ft<sup>2</sup> -\n                </span>'

In [15]:
#Example of data point with only square feet

size.iloc[3, 0]

'<span class="housing">\n                    575ft<sup>2</sup> -\n                </span>'

In [16]:
#Example of data point with only bedrooms

size.iloc[4, 0]

'<span class="housing">\n                    1br -\n                </span>'

In [17]:
#Taking out unnecessary strings in the data
#The fragments are removed one after another, in this order: the opening span
#with its indentation, then the "<sup>2</sup>" closing tail, then the plain
#closing tail. The order matters because the second fragment ends with the third.

html_fragments = (
    '<span class="housing">\n                    ',
    '<sup>2</sup> -\n                </span>',
    ' -\n                </span>',
)
for fragment in html_fragments:
    size["Size"] = size["Size"].str.replace(fragment, '')

In [18]:
#Bedrooms and square feet without unnecessary strings

size.head(10)

Unnamed: 0,Size
0,2br -\n 1000ft
1,2br -\n 893ft
2,1br -\n 1075ft
3,575ft
4,1br
5,1br -\n 650ft
6,489ft
7,2br
8,500ft
9,550ft


In [19]:
#Splitting the bedrooms and square feet into list elements
#The separator is the " -" that follows the bedroom count plus the newline and
#indentation that precede the square-feet value in the scraped markup.

field_separator = ' -\n                    '
size["Size"] = size["Size"].str.split(field_separator)

In [20]:
#Bedrooms and square feet as list elements

size.head(10)

Unnamed: 0,Size
0,"[2br, 1000ft]"
1,"[2br, 893ft]"
2,"[1br, 1075ft]"
3,[575ft]
4,[1br]
5,"[1br, 650ft]"
6,[489ft]
7,[2br]
8,[500ft]
9,[550ft]


In [21]:
#Creating empty dataframes for separating bedrooms and square feet
#Both frames get one row per row of size and a single, initially empty column.

row_labels = range(len(size))
beds = pd.DataFrame(index=row_labels, columns=["Bedrooms"], dtype="str")
sq_ft = pd.DataFrame(index=row_labels, columns=["Square Feet"], dtype="str")

In [22]:
#Putting bedrooms and square feet data into separate dataframes
#Each row of size["Size"] is a list of one or two tokens such as "2br" and/or
#"1000ft". A token is routed by its unit suffix; when a row has no token of a
#given kind, NaN is recorded for it.
#The columns are assigned in one step instead of cell by cell through chained
#indexing (frame[column][row] = value): pandas may apply a chained assignment to
#a temporary copy, and with Copy-on-Write enabled it never reaches the frame.

bedroom_tokens = []
area_tokens = []
for tokens in size["Size"]:
    #"br" is tested first, matching the precedence of the bedroom check
    bedroom_tokens.append(next((t for t in tokens if "br" in t), np.nan))
    area_tokens.append(
        next((t for t in tokens if "ft" in t and "br" not in t), np.nan)
    )

beds["Bedrooms"] = bedroom_tokens
sq_ft["Square Feet"] = area_tokens

In [23]:
#Bedrooms data

beds.head(10)

Unnamed: 0,Bedrooms
0,2br
1,2br
2,1br
3,
4,1br
5,1br
6,
7,2br
8,
9,


In [24]:
#Square feet data

sq_ft.head(10)

Unnamed: 0,Square Feet
0,1000ft
1,893ft
2,1075ft
3,575ft
4,
5,650ft
6,489ft
7,
8,500ft
9,550ft


In [25]:
#Converting the bedrooms and square feet data type into float
#The unit suffixes are stripped first so that only the numeric text remains,
#then each frame is cast to float (missing entries stay NaN).

bedroom_text = beds["Bedrooms"].str.replace("br", "")
area_text = sq_ft["Square Feet"].str.replace("ft", "")
beds["Bedrooms"] = bedroom_text
sq_ft["Square Feet"] = area_text
beds, sq_ft = beds.astype(float), sq_ft.astype(float)

In [26]:
#Bedrooms as float

beds.head(10)

Unnamed: 0,Bedrooms
0,2.0
1,2.0
2,1.0
3,
4,1.0
5,1.0
6,
7,2.0
8,
9,


In [27]:
#Square feet as float

sq_ft.head(10)

Unnamed: 0,Square Feet
0,1000.0
1,893.0
2,1075.0
3,575.0
4,
5,650.0
6,489.0
7,
8,500.0
9,550.0


In [28]:
#Combining the rent, bedrooms, and square feet into one dataframe
#The three single-column frames are placed side by side; pd.concat matches rows
#by index label, so row i of each frame lands on row i of the result.
#NOTE(review): this pairs the i-th price with the i-th size entry -- presumably
#every listing has both elements; verify that the frames have equal length.

frames = [price, beds, sq_ft]
dataset = pd.concat(frames, axis="columns")

In [29]:
# Combined data

dataset.head(10)

Unnamed: 0,Rent,Bedrooms,Square Feet
0,2199.0,2.0,1000.0
1,6315.0,2.0,893.0
2,4695.0,1.0,1075.0
3,1615.0,,575.0
4,1695.0,1.0,
5,2095.0,1.0,650.0
6,1922.0,,489.0
7,2350.0,2.0,
8,1325.0,,500.0
9,1495.0,,550.0
