
## Process to obtain the data:
1. Get the number of posts.
    * define city name.
    * create url with get_url function
    * requests.get(url) to get response
    * use BeautifulSoup to parse the full text of the posts
    * find the total number of posts and return it
2. Use the total number of posts to get each page's first post number (120 posts per page)
    * List name pages e.g. [0, 120, 240, 360, ........]
3. Scrape the information I need and store in the list
    * loop over the pages list, opening the page that starts at each post number
    * Create a function to obtain the information.
        This will scrape each post, then store the information. 
4. Create a DataFrame with the information I scraped.
    * Append it to the (initially empty) list of DataFrames 
    * Save as csv file


#### Sub_city List:
* sub_city_van_area: 
        * 'van'- Vancouver
        * 'bnc' - Burnaby/NewWest
        * 'rds' - Delta/Surrey/Langley
        * 'nvn' - North Shore
        * 'rch' - Richmond
        * 'pml' - Tri-Cities/Pitt Meadows/Maple Ridge
* sub_city_tor_area: 
        * 'tor' - City of Toronto
        * 'bra' - Brampton
        * 'drh' - Durham Region
        * 'mss' - Mississauga
        * 'oak' - Oakville
        * 'yrk' - York Region

In [1]:
import os
from time import sleep
from warnings import warn

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from requests import get

def get_search_number(url, city, sub_city):
    """Request a search page and return the total result count it reports.

    Parameters
    ----------
    url : str
        Address of the search-results page to request.
    city : str
        City name; used only in the printed summary line.
    sub_city : str
        Sub-area code; used only in the printed summary line.

    Returns
    -------
    int
        The integer text of the ``span.totalcount`` element found inside
        the page's ``div.search-legend`` element.

    Raises
    ------
    ValueError
        If the page has no ``div.search-legend`` / ``span.totalcount``
        element, or if the count text is not an integer.
    """
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Look the count up step by step so a missing element produces a clear
    # error instead of an AttributeError on None.
    legend = soup.find('div', class_='search-legend')
    total_tag = legend.find('span', class_='totalcount') if legend is not None else None
    if total_tag is None:
        raise ValueError(
            f"Could not find the total result count on {url} "
            f"(status code: {response.status_code})"
        )

    results_total = int(total_tag.text)
    print(f"Total number of search result in {city} - {sub_city}: {results_total}")
    return results_total

In [2]:
class Url:
    """Build the Craigslist apartment-search URL for a city / sub-area / offset.

    Parameters
    ----------
    city_name : str
        City name; its lower-cased form is used as the Craigslist subdomain.
    sub_city : str, optional
        Sub-area code placed in the URL path (e.g. ``'van'``). It is left out
        of the path for ``'Abbotsford'`` and when it is ``None``.
    num_post : int, optional
        Value of the ``s`` query parameter (the first post of the page).

    Attributes
    ----------
    url : str
        The URL built by :meth:`get_url` at construction time.
    """

    # Fixed search options appended after the ``s`` offset parameter.
    _QUERY = ('sort=date&bundleDuplicates=1&min_price=&max_price='
              '&availabilityMode=0&sale_date=all+dates')

    def __init__(self, city_name, sub_city=None, num_post=0):
        self.city_name = city_name
        self.sub_city = sub_city
        self.num_post = num_post
        self.url = self.get_url()

    def get_url(self):
        """Return the search URL for the current attribute values."""
        host = f'https://{self.city_name.lower()}.craigslist.org'

        # Abbotsford is searched without a sub-area segment; a missing
        # sub_city is handled the same way so the path never contains the
        # literal text "None".
        if self.city_name == 'Abbotsford' or self.sub_city is None:
            path = '/search/apa'
        else:
            path = f'/search/{self.sub_city}/apa'

        # The '&' after the offset is required: without it the offset and
        # "sort=date" run together into a single malformed ``s`` value.
        return f'{host}{path}?s={self.num_post}&{self._QUERY}'

In [3]:
%%time
# Step between the start offsets of consecutive result pages.
PAGE_SIZE = 120

# Every scraped DataFrame is appended to this single CSV file.
CSV_PATH = 'C:\\Users\\Masa\\Desktop\\Chilliwack Real Estate\\Data\\rental_price_data.csv'

# Craigslist sub-area codes to scrape for each city.
SUB_CITY_CODES = {
    'Vancouver': ['van', 'bnc', 'rds', 'nvn', 'rch', 'pml'],
    'Toronto': ['tor', 'bra', 'drh', 'mss', 'oak', 'yrk'],
    'Abbotsford': ['fv'],
}


def _parse_housing(housing_text):
    """Split the text of a post's ``span.housing`` into (bedrooms, sqft).

    The text is tokenised on whitespace. Bedrooms are returned as the first
    token with "br" removed (a string); square footage as an int taken from a
    token ending in "ft2". Whichever value is absent is returned as ``np.nan``.
    """
    tokens = housing_text.split()
    if not tokens:
        # Empty housing text: nothing to extract.
        return np.nan, np.nan
    if 'ft2' in tokens[0]:
        # Only square footage is present, so it sits in the first token.
        return np.nan, int(tokens[0][:-3])
    if len(tokens) > 2:
        # Bedrooms first, square footage in the third token.
        return housing_text.replace("br", "").split()[0], int(tokens[2][:-3])
    if len(tokens) == 2:
        # Bedrooms only.
        return housing_text.replace("br", "").split()[0], np.nan
    return np.nan, np.nan


city_names = ["Vancouver", "Toronto", "Abbotsford"]
dfs = []
for city in city_names:
    sub_city_names = SUB_CITY_CODES[city]
    for sub_city in sub_city_names:
        url = Url(city, sub_city)
        url_ = url.url
        results_total = get_search_number(url_, city, sub_city)

        # Start offset of every result page: 0, 120, 240, ...
        pages = np.arange(0, results_total+1, PAGE_SIZE)

        # Counts every listing seen, including those skipped below because
        # they have no neighbourhood tag.
        iterations = 0

        post_date = []
        post_cities = []
        num_bedroom = []
        sqfts = []
        prices = []
        post_titles = []
        post_links = []

        for page in pages:

            # get request
            url = Url(city_name=city, sub_city=sub_city, num_post=page)
            url_ = url.url
            response = get(url_)

            # Pause between requests.
            sleep(1)

            # throw warning for status code that are not 200
            if response.status_code != 200:
                warn(f'Request: {url_}; Status code: {response.status_code}')

            soup = BeautifulSoup(response.text, 'html.parser')

            posts = soup.find_all('li', class_='result-row')

            # Extract data
            for post in posts:

                hood_tag = post.find('span', class_='result-hood')

                # Only listings that carry a neighbourhood tag are stored.
                if hood_tag is not None:

                    # Posting date
                    post_datetime = post.find('time', class_='result-date')['datetime']
                    post_date.append(post_datetime)

                    # Neighbourhood, without the surrounding "( )"
                    post_city = hood_tag.text.strip('( )')
                    post_cities.append(post_city)

                    # Title text
                    post_title = post.find('a', class_='result-title hdrlnk')
                    post_titles.append(post_title.text)

                    # Post price as an integer ("$1,234" -> 1234)
                    price_text = post.find('span', class_='result-price').text
                    price = int(price_text.split('$')[1].replace(',', ''))
                    prices.append(price)

                    # Post link
                    post_links.append(post_title['href'])

                    # Bedrooms / square footage; both NaN when the listing
                    # has no housing details at all.
                    housing_tag = post.find('span', class_='housing')
                    if housing_tag is not None:
                        bedroom_count, sqft = _parse_housing(housing_tag.text)
                    else:
                        bedroom_count, sqft = np.nan, np.nan
                    num_bedroom.append(bedroom_count)
                    sqfts.append(sqft)

                iterations += 1
        print(f"Total Number of post scraped: {iterations}")

        # Store in Pandas DataFrame then append to the list of DataFrame
        df = pd.DataFrame({'Post Datetime': post_date,
                           "City Code": city.upper()[:3],
                           'Area Code': sub_city.upper(),
                           'Post Title': post_titles,
                           'Post URL': post_links,
                           'Neighborhood': post_cities,
                           'Bedroom': num_bedroom,
                           'SQFT': sqfts,
                           'Price': prices})

        dfs.append(df)

        # Append to the CSV file. The header row is written only when the
        # file does not exist yet, so the file always starts with the column
        # names that the later read_csv cell relies on.
        df.to_csv(CSV_PATH, header=not os.path.exists(CSV_PATH), mode='a')

Total number of search result in Vancouver - van: 3000
Total Number of post scraped: 3056
Total number of search result in Vancouver - bnc: 1205
Total Number of post scraped: 1237
Total number of search result in Vancouver - rds: 1728
Total Number of post scraped: 1759
Total number of search result in Vancouver - nvn: 722
Total Number of post scraped: 751
Total number of search result in Vancouver - rch: 335
Total Number of post scraped: 340
Total number of search result in Vancouver - pml: 832
Total Number of post scraped: 838
Total number of search result in Toronto - tor: 3000
Total Number of post scraped: 3039
Total number of search result in Toronto - bra: 60
Total Number of post scraped: 60
Total number of search result in Toronto - drh: 49
Total Number of post scraped: 49
Total number of search result in Toronto - mss: 254
Total Number of post scraped: 254
Total number of search result in Toronto - oak: 16
Total Number of post scraped: 61
Total number of search result in Toronto

In [4]:
# Number of DataFrames collected by the scraping loop (one per sub-area).
len(dfs)

13

In [5]:
# Row count of each collected DataFrame, one per line.
for n_rows in map(len, dfs):
    print(n_rows)

3056
1237
1759
751
340
838
3039
60
49
254
37
96
1344


In [6]:
# The scrape collected more rows than the number of posts reported, so
# duplicates are removed: after sorting newest-first, only the first row for
# each post title is kept.
df = (
    pd.read_csv('C:\\Users\\Masa\\Desktop\\Chilliwack Real Estate\\Data\\rental_price_data.csv')
    .drop(columns=['Unnamed: 0'])
    .sort_values(by="Post Datetime", ascending=False)
    .drop_duplicates(subset="Post Title")
)
len(df)

14993

In [7]:
# Rows whose city code is "VAN".
is_van_city = df["City Code"].eq("VAN")
df.loc[is_van_city]

Unnamed: 0,Post Datetime,City Code,Area Code,Post Title,Post URL,Neighborhood,Bedroom,SQFT,Price
161408,2022-12-22 21:03,VAN,RDS,2 Bed + 2 Bath Condo in Harvard Gardens - Sout...,https://vancouver.craigslist.org/rds/apa/d/sur...,Surrey,2.0,850.0,2500
161409,2022-12-22 21:03,VAN,RDS,*Brand New* 2 Bed + 1 Bath Rental Suite in Sun...,https://vancouver.craigslist.org/rds/apa/d/del...,Delta,2.0,800.0,2300
160171,2022-12-22 21:02,VAN,BNC,$2300 / 2br - 850ft2 - 2 beds 1 bath basement ...,https://vancouver.craigslist.org/bnc/apa/d/new...,NEW WESTMINSTER,2.0,850.0,2300
157116,2022-12-22 21:01,VAN,VAN,NEW BUILDING - Pet Friendly Studio@1170 Barcla...,https://vancouver.craigslist.org/van/apa/d/van...,Vancouver,,400.0,2775
157115,2022-12-22 21:01,VAN,VAN,2 bedroom 1 bathroom basement for rent unfurni...,https://vancouver.craigslist.org/van/apa/d/van...,Vancouver,2.0,900.0,2100
...,...,...,...,...,...,...,...,...,...
8152,2022-10-28 13:49,VAN,PML,One Bedroom Basement with Fantastic View,https://vancouver.craigslist.org/pml/apa/d/map...,Maple Ridge,1.0,870.0,1800
8153,2022-10-28 13:24,VAN,PML,storage,https://vancouver.craigslist.org/pml/apa/d/sur...,Port Coquitlam,,,1000
6158,2022-10-28 13:05,VAN,RDS,One Bedroom Suite - Furnished,https://vancouver.craigslist.org/rds/apa/d/sur...,Surrey,1.0,800.0,1400
6945,2022-10-28 10:35,VAN,NVN,Beautiful 3 Br 2 bathroom,https://vancouver.craigslist.org/nvn/apa/d/nor...,North Vancouver,3.0,1184.0,3895


In [8]:
# Column dtypes and non-null counts of the de-duplicated frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14993 entries, 168514 to 59481
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Post Datetime  14993 non-null  object 
 1   City Code      14993 non-null  object 
 2   Area Code      14993 non-null  object 
 3   Post Title     14993 non-null  object 
 4   Post URL       14993 non-null  object 
 5   Neighborhood   14993 non-null  object 
 6   Bedroom        14516 non-null  float64
 7   SQFT           10592 non-null  float64
 8   Price          14993 non-null  int64  
dtypes: float64(2), int64(1), object(6)
memory usage: 1.1+ MB


In [9]:
# Rows where both the city code and the area code are "VAN".
city_is_van = df["City Code"].eq("VAN")
area_is_van = df["Area Code"].eq("VAN")
df.loc[city_is_van & area_is_van]

Unnamed: 0,Post Datetime,City Code,Area Code,Post Title,Post URL,Neighborhood,Bedroom,SQFT,Price
157116,2022-12-22 21:01,VAN,VAN,NEW BUILDING - Pet Friendly Studio@1170 Barcla...,https://vancouver.craigslist.org/van/apa/d/van...,Vancouver,,400.0,2775
157115,2022-12-22 21:01,VAN,VAN,2 bedroom 1 bathroom basement for rent unfurni...,https://vancouver.craigslist.org/van/apa/d/van...,Vancouver,2.0,900.0,2100
157118,2022-12-22 20:57,VAN,VAN,Bright and spacious 2 Bedroom Ground Floor Eas...,https://vancouver.craigslist.org/van/apa/d/van...,East Vancouver Killarney,2.0,750.0,1950
157119,2022-12-22 20:52,VAN,VAN,FULLY FURNISHED Luxury 1 BR and Flex Room TELU...,https://vancouver.craigslist.org/van/apa/d/van...,777 Richards St and Robson St,1.0,610.0,2888
157120,2022-12-22 20:37,VAN,VAN,NEW BUILDING-Available NOW -Pet Friendly One B...,https://vancouver.craigslist.org/van/apa/d/van...,Vancouver,1.0,865.0,2990
...,...,...,...,...,...,...,...,...,...
3043,2022-11-22 22:25,VAN,VAN,(Conference Plaza) 2 Bed + 2 Bath with Solarium,https://vancouver.craigslist.org/van/apa/d/van...,"438 Seymour Street, Vancouver",2.0,721.0,3500
3044,2022-11-22 22:16,VAN,VAN,One basement studio in Dunbar area for rent,https://vancouver.craigslist.org/van/apa/d/van...,Vancouver,1.0,400.0,1700
3046,2022-11-22 19:42,VAN,VAN,"Renovated 1 BED-RM SUITE + BATH, Urban LOCATI...",https://vancouver.craigslist.org/van/apa/d/van...,"Vancouver, BC",1.0,550.0,1825
3047,2022-11-22 19:40,VAN,VAN,furnished one bedroom Garden level suite with ...,https://vancouver.craigslist.org/van/apa/d/wes...,west vancouver (cualfield village,1.0,,1550
