# Sudo code:
### Process to obtain the data:
1. Get the number of posts.
    * define city name.
    * create url with get_url function
    * requests.get(url) to get response
    * use BeautifuSoup to get entire posts text
    * Find total number of posts then return 
2. Use the total number of posts to get each pages first post number (119 posts per page)
    * List name pages e.g. [0, 120, 240, 360, ........]
3. Scrape the information I need and store in the list
    * loop each pages list, it opens the page starting with the posts number
    * Create function to obtain informations.
        This will scrape each post, then store the information. 
4. Creat DataFrame with the information I scraped.
    * Store in the list of empty DataFrames 
    * Save as csv file


#### Sub_city List:
* sub_city_van_area: 
        * 'van'- Vancouver
        * 'bnc' - Burnaby/NewWest
        * 'rds' - Delta/Surrey/Langley
        * 'nvn' - North Shore
        * 'rch' - Richmond
        * 'pml' - Tricities/Pittmedow/Maple Ridge
* sub_city_tor_area: 
        * 'tor' - City of Toronto
        * 'bra' - Brampton
        * 'drh' - Durham Region
        * 'mss' - Mississauga
        * 'oak' - Oakville
        * 'yrk' - York Region

In [1]:
from requests import get
from bs4 import BeautifulSoup
from time import sleep
import numpy as np
import pandas as pd

def get_search_number(url, city, sub_city):
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    posts = soup.find_all('li', class_='result-row')
    results_num = soup.find('div', class_='search-legend')
    results_total = int(results_num.find('span', class_='totalcount').text)
    print(f"Total number of search result in {city} - {sub_city}: {results_total}")
    return results_total

In [2]:
class Url:
    def __init__(self, city_name, sub_city=None, num_post=0):
        self.city_name = city_name
        self.sub_city = sub_city
        self.num_post = num_post
        self.url = self.get_url()
        
    def get_url(self):
        if self.city_name == 'Abbotsford':
            url = f'https://{self.city_name.lower()}.craigslist.org/search/apa?s={self.num_post}sort=date&bundleDuplicates=1&min_price=99max_price=&availabilityMode=0&sale_date=all+dates'
        else:
            url = f'https://{self.city_name.lower()}.craigslist.org/search/{self.sub_city}/apa?s={self.num_post}sort=date&bundleDuplicates=1&min_price=99max_price=&availabilityMode=0&sale_date=all+dates'
        return url   

In [13]:
%%time
city_names = ["Vancouver", "Toronto", "Abbotsford"]
dfs = []
for city in city_names:
    if city == 'Vancouver':
        sub_city_names = ['van', 'bnc', 'rds', 'nvn', 'rch', 'pml']
    elif city == 'Toronto':
        sub_city_names = ['tor', 'bra', 'drh', 'mss', 'oak', 'yrk']
    elif city == "Abbotsford":
        sub_city_names = ['fv']
    for sub_city in sub_city_names:
        url = Url(city, sub_city)
        url_ = url.url
        results_total = get_search_number(url_, city, sub_city)
        
        pages = np.arange(0, results_total+1, 120)
        
        iterations = 0

        post_date = []
        post_cities = []
        num_bedroom = []
        sqfts = []
        prices = []
        post_titles = []
        post_links = []

        for page in pages:
     
            # get request
            url = Url(city_name=city, sub_city=sub_city, num_post=page)
            url_ = url.url
            response = get(url_)
    
            sleep(1)
    
            # throw warning for status code that are not 200
            if response.status_code != 200:
                warn(f'Request: {requests}; Status code: {response.status_code}')
        
            soup = BeautifulSoup(response.text, 'html.parser')
    
            posts = soup.find_all('li', class_='result-row')
    
    
            # Extract data 
            for post in posts:
        
                if post.find('span', class_ = 'result-hood') is not None:
            
                    # Posting date
                    post_datetime = post.find('time', class_='result-date')['datetime']
                    post_date.append(post_datetime)
            
                    # Neighbourhoods
                    post_city = post.find('span', class_='result-hood').text.strip('( )')
                    post_cities.append(post_city)
            
                    # title text
                    post_title = post.find('a', class_='result-title hdrlnk')
                    post_title_text = post_title.text
                    post_titles.append(post_title_text)
            
                    # Post price in integer
                    price = int(post.find('span', class_='result-price').text.split('$')[1].replace(',', ''))
                    prices.append(price)
            
                    # Post link
                    post_link = post_title['href']
                    post_links.append(post_link)
            
                    if post.find('span', class_ = 'housing') is not None:
                
                        #if the first element is accidentally square footage
                        if 'ft2' in post.find('span', class_ = 'housing').text.split()[0]:
                    
                            #make bedroom nan
                            bedroom_count = np.nan
                            num_bedroom.append(bedroom_count)
                    
                            #make sqft the first element
                            sqft = int(post.find('span', class_ = 'housing').text.split()[0][:-3])
                            sqfts.append(sqft)
                    
                        #if the length of the housing details element is more than 2
                        elif len(post.find('span', class_ = 'housing').text.split()) > 2:
                    
                            #therefore element 0 will be bedroom count
                            bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                            num_bedroom.append(bedroom_count)
                    
                            #and sqft will be number 3, so set these here and append
                            sqft = int(post.find('span', class_ = 'housing').text.split()[2][:-3])
                            sqfts.append(sqft)
                    
                        #if there is num bedrooms but no sqft
                        elif len(post.find('span', class_ = 'housing').text.split()) == 2:
                    
                            #therefore element 0 will be bedroom count
                            bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                            num_bedroom.append(bedroom_count)
                    
                            #and sqft will be number 3, so set these here and append
                            sqft = np.nan
                            sqfts.append(sqft)                    
                
                        else:
                            bedroom_count = np.nan
                            num_bedroom.append(bedroom_count)
                
                            sqft = np.nan
                            sqfts.append(sqft)
                
                    #if none of those conditions catch, make bedroom nan, this won't be needed    
                    else:
                        bedroom_count = np.nan
                        num_bedroom.append(bedroom_count)
                
                        sqft = np.nan
                        sqfts.append(sqft)
                
                iterations += 1
        print(f"Total Number of post scraped: {iterations}")
        
        # Store in Pandas DataFrame then append to the list of DataFrame
        df = pd.DataFrame({'Post Datetime': post_date,
                           'Post Title': post_titles,
                           'Post URL': post_links,
                          'Neighborhood': post_cities,
                          'Bedroom': num_bedroom,
                          'SQFT': sqfts,
                          'Price': prices})
        
        dfs.append(df)
        
        # Save DataFrame in CSV file. 
        df.to_csv(f'C:\\Users\\Masa\\Desktop\\Chilliwack Real Estate\\Data\\{city}_{sub_city}_rental_price_data.csv',
                 header=False, mode='a')

Total number of search result in Vancouver - van: 3000
Total Number of post scraped: 3047
Total number of search result in Vancouver - bnc: 1300
Total Number of post scraped: 1324
Total number of search result in Vancouver - rds: 1798
Total Number of post scraped: 1832
Total number of search result in Vancouver - nvn: 749
Total Number of post scraped: 782
Total number of search result in Vancouver - rch: 363
Total Number of post scraped: 371
Total number of search result in Vancouver - pml: 840
Total Number of post scraped: 844
Total number of search result in Toronto - tor: 3000
Total Number of post scraped: 3032
Total number of search result in Toronto - bra: 54
Total Number of post scraped: 54
Total number of search result in Toronto - drh: 43
Total Number of post scraped: 43
Total number of search result in Toronto - mss: 234
Total Number of post scraped: 234
Total number of search result in Toronto - oak: 9
Total Number of post scraped: 61
Total number of search result in Toronto 

In [14]:
len(dfs)

13

In [15]:
for df in dfs:
    print(len(df))

3047
1324
1832
782
371
844
3032
54
43
234
32
108
1449


In [27]:
# As you can tell the number of scraped data is exceeding the number of the post, so I remove duplicate. 
df = pd.read_csv('C:\\Users\\Masa\\Desktop\\Chilliwack Real Estate\\Data\\Vancouver_van_rental_price_data.csv',
                index_col='Post Datetime').drop('Unnamed: 0', axis=1)
df.drop_duplicates(inplace=True)
len(df)

3069

In [25]:
df.head()

Unnamed: 0_level_0,Post Title,Post URL,Neighborhood,Bedroom,SQFT,Price
Post Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-10 17:26,Three bedroom concrete condo in Vancouver West,https://vancouver.craigslist.org/van/apa/d/van...,7638 Cambie Street,3.0,1100.0,4300
2022-12-10 17:26,Furnished 1 bedroom condo (2 beds w/ 1 spare b...,https://vancouver.craigslist.org/van/apa/d/van...,UBC campus,1.0,750.0,2800
2022-12-10 17:22,Furnished All Inclusive-Parking-Solarium-1 GB ...,https://vancouver.craigslist.org/van/apa/d/van...,"Coal Harbor, Vancouver BC",1.0,588.0,3950
2022-12-10 17:14,"1 bed 1 bath at Keefer Block, pet friendly!",https://vancouver.craigslist.org/van/apa/d/van...,city of vancouver,,,2300
2022-12-10 17:13,1 BEDROOM SUITE IN CHARACTER HOUSE,https://vancouver.craigslist.org/van/apa/d/van...,Commercial Drive Area,1.0,550.0,1650


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3133 entries, 0 to 6093
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Post Datetime  3133 non-null   object 
 1   Post Title     3133 non-null   object 
 2   Post URL       3133 non-null   object 
 3   Neighborhood   3133 non-null   object 
 4   Bedroom        2928 non-null   float64
 5   SQFT           2757 non-null   float64
 6   Price          3133 non-null   int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 195.8+ KB


In [23]:
# Show Abbotsford DataFrame
dfs[12]

Unnamed: 0,Post Datetime,Post Title,Post URL,Neighborhood,Bedroom,SQFT,Price
0,2022-12-11 09:36,Suite for rent,https://abbotsford.craigslist.org/apa/d/langle...,"Brookswood, Langley",2,,1850
1,2022-12-11 09:30,Beautiful 3 Bedroom Condo for Rent,https://abbotsford.craigslist.org/apa/d/chilli...,Chilliwack,3,1322.0,2250
2,2022-12-11 09:05,2.5 bathroom home in Matsqui Village,https://abbotsford.craigslist.org/apa/d/abbots...,Abbotsford,3,2200.0,3200
3,2022-12-11 09:05,2 bedroom suite,https://abbotsford.craigslist.org/apa/d/gerlac...,Chilliwack,1,,1500
4,2022-12-11 09:05,4-5 bedroom house,https://abbotsford.craigslist.org/apa/d/abbots...,abbotsford,4,,2500
...,...,...,...,...,...,...,...
1444,2022-10-27 10:17,Brand New 2BR + 2BA Luxury Condo,https://abbotsford.craigslist.org/apa/d/langle...,Willoughby,2,948.0,2550
1445,2022-10-27 10:11,2 Bedroom Apartment,https://abbotsford.craigslist.org/apa/d/langle...,Langley,2,,2200
1446,2022-10-27 09:44,Spacious 2 Bedroom Basement Suite Fleetwood,https://abbotsford.craigslist.org/apa/d/surrey...,Surrey,2,950.0,1500
1447,2022-10-27 06:29,4 bedroom home with big lot for parking,https://abbotsford.craigslist.org/apa/d/chilli...,Chilliwack,2,1750.0,2200
