# Pseudocode:
### Process to obtain the data:
1. Get the number of posts.
    * define city name.
    * create url with get_url function
    * requests.get(url) to get response
    * use BeautifulSoup to parse the full text of the posts page
    * Find the total number of posts, then return it
2. Use the total number of posts to get each page's first post number (120 posts per page)
    * Store them in a list named pages, e.g. [0, 120, 240, 360, ........]
3. Scrape the information I need and store in the list
    * Loop over the pages list, opening the results page that starts at each post number
    * Create a function to obtain the information.
        This will scrape each post, then store the information. 
4. Create a DataFrame with the information I scraped.
    * Append it to the list of DataFrames (initially empty)
    * Save it as a CSV file


#### Sub_city List:
* sub_city_van_area: 
        * 'van'- Vancouver
        * 'bnc' - Burnaby/NewWest
        * 'rds' - Delta/Surrey/Langley
        * 'nvn' - North Shore
        * 'rch' - Richmond
        * 'pml' - Tri-Cities/Pitt Meadows/Maple Ridge
* sub_city_tor_area: 
        * 'tor' - City of Toronto
        * 'bra' - Brampton
        * 'drh' - Durham Region
        * 'mss' - Mississauga
        * 'oak' - Oakville
        * 'yrk' - York Region

In [2]:
from requests import get
from bs4 import BeautifulSoup
from time import sleep
import numpy as np
import pandas as pd

def get_search_number(url, city, sub_city):
    """Fetch a search results page and return the total result count it reports.

    Args:
        url: Address of the search results page to request.
        city: City name; used only in the printed summary line.
        sub_city: Sub-area code; used only in the printed summary line.

    Returns:
        The integer parsed from the ``span.totalcount`` element found inside
        the page's ``div.search-legend`` element.

    Raises:
        ValueError: If the page has no ``div.search-legend`` /
            ``span.totalcount`` element, or its text is not an integer.
    """
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the legend is needed here; the individual result rows are parsed
    # later by the scraping loop, so they are not collected in this function.
    legend = soup.find('div', class_='search-legend')
    total_tag = None
    if legend is not None:
        total_tag = legend.find('span', class_='totalcount')
    if total_tag is None:
        # Fail with a message that names the page instead of an opaque
        # "'NoneType' object has no attribute ..." error.
        raise ValueError(
            f"No total result count found on {url} "
            f"(HTTP status {response.status_code})"
        )

    results_total = int(total_tag.text)
    print(f"Total number of search result in {city} - {sub_city}: {results_total}")
    return results_total

In [3]:
class Url:
    """Build a Craigslist apartment-search ('apa') URL for a city and sub-area.

    Attributes:
        city_name: City name; lower-cased to form the ``<city>.craigslist.org`` host.
        sub_city: Sub-area code placed in the URL path (e.g. ``'van'``), or None.
        num_post: Value sent as the ``s`` query parameter (the scraper passes
            the number of the first post on the requested page).
        url: The URL built by ``get_url()`` at construction time.
    """

    # Fixed search filters. Every parameter is separated by '&' so that each
    # one is sent as its own key/value pair.
    _FILTERS = ('sort=date&bundleDuplicates=1&min_price=99&max_price='
                '&availabilityMode=0&sale_date=all+dates')

    def __init__(self, city_name, sub_city=None, num_post=0):
        self.city_name = city_name
        self.sub_city = sub_city
        self.num_post = num_post
        self.url = self.get_url()

    def get_url(self):
        """Return the search URL for the stored city, sub-area and post offset.

        The sub-area path segment is omitted for 'Abbotsford' and whenever
        ``sub_city`` is None (so the literal text 'None' never ends up in
        the path).
        """
        host = f'https://{self.city_name.lower()}.craigslist.org'
        if self.city_name == 'Abbotsford' or self.sub_city is None:
            path = '/search/apa'
        else:
            path = f'/search/{self.sub_city}/apa'
        return f'{host}{path}?s={self.num_post}&{self._FILTERS}'

In [22]:
%%time
from warnings import warn  # needed by the non-200 status check in the page loop


def _parse_housing(housing_text):
    """Split the text of a post's 'housing' span into (bedroom_count, sqft).

    The text is split on whitespace and interpreted as follows:
      * no tokens                      -> (nan, nan)
      * first token contains 'ft2'     -> (nan, int of that token minus 'ft2')
      * more than two tokens           -> (bedrooms, int of third token minus 'ft2')
      * exactly two tokens             -> (bedrooms, nan)
      * anything else                  -> (nan, nan)

    The bedroom count is returned as the string left after removing "br"
    from the text (it is not converted to a number here).
    """
    tokens = housing_text.split()
    if not tokens:
        # Empty span: nothing to read (indexing tokens[0] would fail).
        return np.nan, np.nan
    if 'ft2' in tokens[0]:
        # Only the square footage is present, in the first position.
        return np.nan, int(tokens[0][:-3])
    if len(tokens) > 2:
        return housing_text.replace("br", "").split()[0], int(tokens[2][:-3])
    if len(tokens) == 2:
        # Bedroom count present, but no square footage.
        return housing_text.replace("br", "").split()[0], np.nan
    return np.nan, np.nan


city_names = ["Vancouver", "Toronto", "Abbotsford"]

# Sub-area codes scraped for each city.
sub_cities_by_city = {
    'Vancouver': ['van', 'bnc', 'rds', 'nvn', 'rch', 'pml'],
    'Toronto': ['tor', 'bra', 'drh', 'mss', 'oak', 'yrk'],
    'Abbotsford': ['fv'],
}

dfs = []
for city in city_names:
    sub_city_names = sub_cities_by_city[city]
    for sub_city in sub_city_names:
        url = Url(city, sub_city)
        url_ = url.url
        results_total = get_search_number(url_, city, sub_city)

        # First-post number of every results page, stepping by 120.
        pages = np.arange(0, results_total+1, 120)

        # Counts every result row seen, whether or not it was stored.
        iterations = 0

        # One list per output column; each stored post appends to all of them.
        post_date = []
        post_cities = []
        num_bedroom = []
        sqfts = []
        prices = []
        post_titles = []
        post_links = []

        for page in pages:

            # get request
            url = Url(city_name=city, sub_city=sub_city, num_post=page)
            url_ = url.url
            response = get(url_)

            # pause between consecutive requests
            sleep(1)

            # throw warning for status code that are not 200
            if response.status_code != 200:
                warn(f'Request: {url_}; Status code: {response.status_code}')

            soup = BeautifulSoup(response.text, 'html.parser')
            posts = soup.find_all('li', class_='result-row')

            # Extract data
            for post in posts:
                hood = post.find('span', class_='result-hood')

                # Only posts that carry a neighbourhood tag are stored.
                if hood is not None:

                    # Posting date
                    post_datetime = post.find('time', class_='result-date')['datetime']
                    post_date.append(post_datetime)

                    # Neighbourhood, without the surrounding parentheses/spaces
                    post_city = hood.text.strip('( )')
                    post_cities.append(post_city)

                    # Title text
                    post_title = post.find('a', class_='result-title hdrlnk')
                    post_titles.append(post_title.text)

                    # Post price in integer: text after '$' with thousands commas removed
                    price_text = post.find('span', class_='result-price').text
                    price = int(price_text.split('$')[1].replace(',', ''))
                    prices.append(price)

                    # Post link
                    post_links.append(post_title['href'])

                    # Bedrooms / square footage; both nan when the span is missing
                    housing = post.find('span', class_='housing')
                    if housing is not None:
                        bedroom_count, sqft = _parse_housing(housing.text)
                    else:
                        bedroom_count, sqft = np.nan, np.nan
                    num_bedroom.append(bedroom_count)
                    sqfts.append(sqft)

                iterations += 1
        print(f"Total Number of post scraped: {iterations}")

        # Store in Pandas DataFrame then append to the list of DataFrame
        df = pd.DataFrame({'Post Datetime': post_date,
                           "City Code": city.upper()[:3],
                           'Area Code': sub_city.upper(),
                           'Post Title': post_titles,
                           'Post URL': post_links,
                           'Neighborhood': post_cities,
                           'Bedroom': num_bedroom,
                           'SQFT': sqfts,
                           'Price': prices})

        dfs.append(df)

        # Save DataFrame in CSV file: rows are appended to the existing file
        # and no header row is written.
        df.to_csv(f'C:\\Users\\Masa\\Desktop\\Chilliwack Real Estate\\Data\\rental_price_data.csv',
                  header=False, mode='a')

Total number of search result in Vancouver - van: 3000
Total Number of post scraped: 3050
Total number of search result in Vancouver - bnc: 1286
Total Number of post scraped: 1310
Total number of search result in Vancouver - rds: 1764
Total Number of post scraped: 1799
Total number of search result in Vancouver - nvn: 750
Total Number of post scraped: 788
Total number of search result in Vancouver - rch: 358
Total Number of post scraped: 363
Total number of search result in Vancouver - pml: 839
Total Number of post scraped: 845
Total number of search result in Toronto - tor: 3000
Total Number of post scraped: 3029
Total number of search result in Toronto - bra: 54
Total Number of post scraped: 54
Total number of search result in Toronto - drh: 52
Total Number of post scraped: 52
Total number of search result in Toronto - mss: 233
Total Number of post scraped: 233
Total number of search result in Toronto - oak: 9
Total Number of post scraped: 61
Total number of search result in Toronto 

In [23]:
len(dfs)

13

In [24]:
# Print the row count of each DataFrame in dfs, one per line, in list order.
for df in dfs:
    n_rows = len(df)
    print(n_rows)

3050
1310
1799
788
363
845
3029
54
52
233
23
105
1439


In [25]:
# The scrape produced more rows than the reported number of posts,
# so reload the saved CSV and drop the duplicate rows.
csv_path = 'C:\\Users\\Masa\\Desktop\\Chilliwack Real Estate\\Data\\rental_price_data.csv'
df = pd.read_csv(csv_path, index_col='Post Datetime')
df = df.drop(columns='Unnamed: 0')  # leftover column holding the saved row index
df = df.drop_duplicates()
len(df)

12874

In [26]:
df

Unnamed: 0_level_0,City Code,Area Code,Post Title,Post URL,Neighborhood,Bedroom,SQFT,Price
Post Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-12-12 7:28,VAN,VAN,"2525 Birch st , one bedroom",https://vancouver.craigslist.org/van/apa/d/van...,Vancouver,1.0,600.0,2000
2022-12-12 7:10,VAN,VAN,Quality Custom built Spacious Family Home desi...,https://vancouver.craigslist.org/van/apa/d/van...,Neighborhood Vancouver West - UBC,5.0,4681.0,7999
2022-12-12 7:07,VAN,VAN,Sensational Mountain & Ocean view updated Luxu...,https://vancouver.craigslist.org/van/apa/d/van...,Vancouver – W Point Grey,5.0,4262.0,12000
2022-12-12 7:07,VAN,VAN,Like New Spacious 1bd+1den Luxurious Condo!,https://vancouver.craigslist.org/van/apa/d/van...,Vancouver – Cambie,1.0,600.0,2599
2022-12-12 7:06,VAN,VAN,Large Two Bedroom Apartment Near VGH,https://vancouver.craigslist.org/van/apa/d/van...,Vancouver,2.0,,2295
...,...,...,...,...,...,...,...,...
2022-10-28 12:55,ABB,FV,"Spacious, comfortable, private suburban family...",https://abbotsford.craigslist.org/apa/d/langle...,Langley BC,4.0,2300.0,3950
2022-10-28 12:11,ABB,FV,Langley Condo 1BR BRAND NEW,https://abbotsford.craigslist.org/apa/d/langle...,Langley,1.0,,2000
2022-10-28 11:23,ABB,FV,Whole house for rent,https://abbotsford.craigslist.org/apa/d/prince...,Maple Ridge,3.0,1500.0,4000
2022-10-28 10:50,ABB,FV,Large 3 Bedroom - Abbotsford,https://abbotsford.craigslist.org/apa/d/abbots...,Abbotsford,3.0,1500.0,1600


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12874 entries, 2022-12-12 7:28 to 2022-10-28 6:18
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   City Code     12874 non-null  object 
 1   Area Code     12874 non-null  object 
 2   Post Title    12874 non-null  object 
 3   Post URL      12874 non-null  object 
 4   Neighborhood  12874 non-null  object 
 5   Bedroom       12501 non-null  float64
 6   SQFT          8970 non-null   float64
 7   Price         12874 non-null  int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 905.2+ KB


In [29]:
# Show the City of Toronto ('tor') DataFrame (index 6 in dfs; Abbotsford is the last entry)
dfs[6]

Unnamed: 0,Post Datetime,City Code,Area Code,Post Title,Post URL,Neighborhood,Bedroom,SQFT,Price
0,2022-12-12 10:37,TOR,TOR,C5832363 Rare Find Beautiful Spacious 35 Lower...,https://toronto.craigslist.org/tor/apa/d/downt...,Toronto (Waterfront,2,1200.0,3600
1,2022-12-12 10:37,TOR,TOR,C5834752 Rise With The Morning Sun! 180 Mill St,https://toronto.craigslist.org/tor/apa/d/downt...,Toronto (Waterfront,3,1100.0,3750
2,2022-12-12 10:37,TOR,TOR,C5840685 Stunning Corner Unit Beautiful River...,https://toronto.craigslist.org/tor/apa/d/downt...,Toronto (Waterfront,2,922.0,3800
3,2022-12-12 10:37,TOR,TOR,"C5835488 Brand New, Never Lived In, Spacious 3...",https://toronto.craigslist.org/tor/apa/d/downt...,Toronto (Moss Park,3,1100.0,4000
4,2022-12-12 10:37,TOR,TOR,"E5838385 Fabulous 2-Storey, Penthouse Suite ...",https://toronto.craigslist.org/tor/apa/d/east-...,Toronto (Riverdale,2,1100.0,5000
...,...,...,...,...,...,...,...,...,...
3024,2022-11-03 03:17,TOR,TOR,20 Blue Jays Way 215 Toronto Ontario,https://toronto.craigslist.org/tor/apa/d/downt...,city of toronto,2,,3550
3025,2022-11-03 03:17,TOR,TOR,28 Freeland St Toronto Ontario,https://toronto.craigslist.org/tor/apa/d/downt...,city of toronto,1,,3500
3026,2022-11-03 03:17,TOR,TOR,21 Iceboat Terr 2709 Toronto Ontario,https://toronto.craigslist.org/tor/apa/d/downt...,city of toronto,2,,3500
3027,2022-11-03 00:32,TOR,TOR,ENTERTAINMENT DISTRICT FEELS BRAND NEW PARKING...,https://toronto.craigslist.org/tor/apa/d/downt...,99 John St,2,,3900
