In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
import re

In [3]:
# Default search target for the scrape.
city, state = 'Philadelphia', 'PA'
# Base path of the REMAX "homes for sale" search pages; get_webpage() prefixes
# every results-page URL with it.
overhead = 'https://www.remax.com/realestatehomesforsale'
# Sample results-page URL showing the pattern that get_webpage() reproduces.
url = 'https://www.remax.com/realestatehomesforsale/philadelphia-pa-p002.html?query=philadelphia,pa-search/newest-sortorder'

In [4]:
def get_webpage(city, state, pg_num):
    """Build the search-results URL for one page of a city's listings.

    Args:
        city: City name; surrounding whitespace is stripped and it is lower-cased.
        state: State abbreviation; normalised the same way as ``city``.
        pg_num: Page number inserted after the ``-p`` marker in the URL.

    Returns:
        The results-page URL as a string, rooted at the module-level ``overhead``.
    """
    # Normalise both location parts identically before they go into the URL.
    city, state = (part.strip().lower() for part in (city, state))
    return f'{overhead}/{city}-{state}-p{pg_num}.html?query={city},{state}-search/newest-sortorder'

In [5]:
# Sanity check: show the URL generated for the first results page.
get_webpage(city='philadelphia', state='PA', pg_num=1)

'https://www.remax.com/realestatehomesforsale/philadelphia-pa-p1.html?query=philadelphia,pa-search/newest-sortorder'

In [27]:
def get_apt_urls_per_page(city, state, pg_num):
    """Collect the listing detail links found on one search-results page.

    Args:
        city: City name, forwarded to ``get_webpage``.
        state: State abbreviation, forwarded to ``get_webpage``.
        pg_num: Results page number to fetch.

    Returns:
        A list of ``href`` values, one per listing card that has a detail
        link. The list is empty when the page responds with HTTP 404.
    """
    webpage = get_webpage(city, state, pg_num)
    response = requests.get(webpage)

    # Only a 404 is treated as "no such page"; any other response is parsed.
    if response.status_code == 404:
        return []

    soup = BeautifulSoup(response.content, 'lxml')
    apt_urls = []
    for apt_tag in soup.find_all('div', class_='listing-pane-details'):
        apt_link_tag = apt_tag.find('a', class_='js-detaillink')
        # A card without a detail anchor (or an anchor without href) is
        # skipped instead of aborting the whole page.
        if apt_link_tag is None:
            continue
        href = apt_link_tag.get('href')
        if href:
            apt_urls.append(href)

    return apt_urls

In [28]:
# Spot check: listing links scraped from the first results page.
get_apt_urls_per_page(city='philadelphia', state='PA', pg_num=1)

['/realestatehomesforsale/1417-n-8th-st-philadelphia-pa-19122-id342778675.html',
 '/realestatehomesforsale/767-n-24th-st-philadelphia-pa-19130-gid400025159539.html',
 '/realestatehomesforsale/2211-moore-st-philadelphia-pa-19145-id342779562.html',
 '/realestatehomesforsale/763-n-bucknell-st-philadelphia-pa-19130-gid400025239517.html',
 '/realestatehomesforsale/1121-lemon-st-philadelphia-pa-19123-gid400030275230.html',
 '/realestatehomesforsale/1438-s-8th-st-philadelphia-pa-19147-gid400025277879.html',
 '/realestatehomesforsale/2519-2521-n-front-st-philadelphia-pa-19133-id342779336.html',
 '/realestatehomesforsale/1606-s-11th-st-philadelphia-pa-19148-id342779667.html',
 '/realestatehomesforsale/2017-s-6th-st-philadelphia-pa-19148-gid400025375118.html',
 '/realestatehomesforsale/1532-n-7th-st-no-2-philadelphia-pa-19122-id342779177.html',
 '/realestatehomesforsale/1234-s-7th-st-philadelphia-pa-19147-gid400025200829.html',
 '/realestatehomesforsale/1222-emily-st-philadelphia-pa-19148-gid400

In [62]:
def get_ensemble_apt_urls(city, state, verbose=False):
    """Collect listing links from every results page of a city search.

    The first results page is fetched to read the highest page number from
    the pager, then each page is scraped with ``get_apt_urls_per_page``.

    Args:
        city: City name, forwarded to ``get_webpage``.
        state: State abbreviation, forwarded to ``get_webpage``.
        verbose: When true, print progress messages.

    Returns:
        A list with the links of all pages concatenated in page order. The
        list is empty when the first page responds with HTTP 404 or when no
        page count can be read from the pager.
    """
    test_page = get_webpage(city, state, 1)
    response = requests.get(test_page)
    apt_ensemble_urls = []

    if response.status_code == 404:
        return apt_ensemble_urls

    soup = BeautifulSoup(response.content, 'lxml')
    pg_lst = soup.find_all('li', class_='pages-item')
    try:
        # The last pager item is expected to hold the highest page number.
        max_pg_tag = pg_lst[-1].find('a', class_='js-pager-item pages-link')
        max_pg = int(max_pg_tag.get_text())
    except (IndexError, AttributeError, ValueError):
        # No pager items, no link inside the last item, or non-numeric text.
        # None is the sentinel because NaN never compares equal to itself,
        # so an `== np.nan` guard can never detect this case.
        max_pg = None

    if max_pg is not None:
        if verbose:
            print(f'there are {max_pg} apartment URLs to be collected')
        for pg_num in range(1, max_pg + 1):
            apt_ensemble_urls += get_apt_urls_per_page(city, state, pg_num)
            if verbose:
                print(f'page {pg_num} apartment URLs collected')
    if verbose:
        print(f'all apartment URLs collected')
    return apt_ensemble_urls

In [63]:
# Full scrape of every results page for Philadelphia, with progress output.
all_apt_urls = get_ensemble_apt_urls(city='philadelphia', state='pa', verbose=True)

there are 394 apartment URLs to be collected
page 1 apartment URLs collected
page 2 apartment URLs collected
page 3 apartment URLs collected
page 4 apartment URLs collected
page 5 apartment URLs collected
page 6 apartment URLs collected
page 7 apartment URLs collected
page 8 apartment URLs collected
page 9 apartment URLs collected
page 10 apartment URLs collected
page 11 apartment URLs collected
page 12 apartment URLs collected
page 13 apartment URLs collected
page 14 apartment URLs collected
page 15 apartment URLs collected
page 16 apartment URLs collected
page 17 apartment URLs collected
page 18 apartment URLs collected
page 19 apartment URLs collected
page 20 apartment URLs collected
page 21 apartment URLs collected
page 22 apartment URLs collected
page 23 apartment URLs collected
page 24 apartment URLs collected
page 25 apartment URLs collected
page 26 apartment URLs collected
page 27 apartment URLs collected
page 28 apartment URLs collected
page 29 apartment URLs collected
page 30

page 244 apartment URLs collected
page 245 apartment URLs collected
page 246 apartment URLs collected
page 247 apartment URLs collected
page 248 apartment URLs collected
page 249 apartment URLs collected
page 250 apartment URLs collected
page 251 apartment URLs collected
page 252 apartment URLs collected
page 253 apartment URLs collected
page 254 apartment URLs collected
page 255 apartment URLs collected
page 256 apartment URLs collected
page 257 apartment URLs collected
page 258 apartment URLs collected
page 259 apartment URLs collected
page 260 apartment URLs collected
page 261 apartment URLs collected
page 262 apartment URLs collected
page 263 apartment URLs collected
page 264 apartment URLs collected
page 265 apartment URLs collected
page 266 apartment URLs collected
page 267 apartment URLs collected
page 268 apartment URLs collected
page 269 apartment URLs collected
page 270 apartment URLs collected
page 271 apartment URLs collected
page 272 apartment URLs collected
page 273 apart

In [238]:
# First collected listing link; passed to get_apt_info() further down as a sample.
# NOTE(review): the name suggests a luxury "collection" listing — confirm.
collection_url = all_apt_urls[0]
collection_url

'/realestatehomesforsale/1417-n-8th-st-philadelphia-pa-19122-id342778675.html'

In [237]:
# Second collected listing link; passed to get_apt_info() further down as a sample.
# NOTE(review): the name suggests a standard (non-luxury) listing — confirm.
normal_url = all_apt_urls[1]
normal_url

'/realestatehomesforsale/767-n-24th-st-philadelphia-pa-19130-gid400025159539.html'

In [261]:
def get_price(soup):
    """Read the listing price from a parsed detail page.

    Args:
        soup: Parsed page; it must offer ``find`` returning a tag that has
            ``get_text``.

    Returns:
        The first number found in the price text (thousands separators
        removed) as a float, or ``np.nan`` when anything goes wrong.
    """
    number_pattern = r'[-+]?\d*\.\d+|\d+'
    try:
        amount_tag = soup.find('span', class_='listing-detail-price-amount pad-half-right')
        cleaned = amount_tag.get_text().replace(',', '').strip()
        first_number = re.findall(number_pattern, cleaned)[0]
        return float(first_number)
    except:
        # Missing tag, no digits in the text, or any other failure.
        return np.nan

def get_address(content_tag):
    """Extract street, city, state and zip code from a listing's details container.

    Args:
        content_tag: Tag wrapping the listing details; searched for the
            ``listing-detail-address`` block.

    Returns:
        A ``(street, city, state, zipcode)`` tuple of strings. Commas are
        removed from the street and city, and the city is title-cased. If any
        part cannot be read, the result is ``(None, None, None, None)``.
    """
    def _itemprop_text(container, prop):
        # Stripped text of the <span> carrying the given itemprop value.
        return container.find('span', attrs={'itemprop': prop}).get_text().strip()

    try:
        address_block = content_tag.find('div', class_='listing-detail-address')
        street = _itemprop_text(address_block, 'streetAddress').replace(',', '')
        city = _itemprop_text(address_block, 'addressLocality').replace(',', '').title()
        state = _itemprop_text(address_block, 'addressRegion')
        zipcode = _itemprop_text(address_block, 'postalCode')
    except:
        # All-or-nothing: one unreadable field blanks the whole address.
        return None, None, None, None

    return street, city, state, zipcode
    
def get_sideinfo(content_tag):
    """Collect the label/value stats shown for a listing into a dict.

    Args:
        content_tag: Tag wrapping the listing details; searched for the
            ``forsalelistingdetail`` block and its ``listing-detail-stats``
            items.

    Returns:
        A dict mapping each stat's stripped label text to its stripped value
        text. It is empty when the stats block cannot be found.
    """
    sideinfo = {}
    try:
        apt_info_tag = content_tag.find('div', class_='forsalelistingdetail')
        apt_list_tag = apt_info_tag.find_all('li', class_='listing-detail-stats')
    except AttributeError:
        # content_tag is None or the stats block is missing from the page.
        return sideinfo

    for apt_tag in apt_list_tag:
        spans = apt_tag.find_all('span')
        # A row needs a label span and a value span. Malformed rows are
        # skipped so they cannot discard the rows that follow them.
        if len(spans) < 2:
            continue
        key = spans[0].get_text().strip()
        value = spans[1].get_text().strip()
        sideinfo[key] = value
    return sideinfo

def access_dict(d, key):
    """Look up a scraped stat and convert it to a number when possible.

    Values containing ``'sqft'`` have commas and the unit removed first.

    Args:
        d: Mapping of stat labels to raw text values.
        key: Label to look up.

    Returns:
        The value as a float when it parses as one, otherwise the (possibly
        cleaned) value unchanged. ``None`` when the lookup or the cleaning
        fails for any reason.
    """
    try:
        raw = d[key]
        cleaned = raw.replace(',', '').replace('sqft', '').strip() if 'sqft' in raw else raw
        try:
            number = float(cleaned)
        except:
            # Not numeric: hand back the text as-is.
            return cleaned
        return number
    except:
        return None

In [294]:
def remax_normal(soup):
    """Assemble the flat feature row for a standard (non-luxury) REMAX listing page.

    Args:
        soup: Parsed listing detail page.

    Returns:
        A 24-item list: street, city, state, zip code, nine stats
        (bathrooms through house size), the price, nine more stats (taxes
        through subdivision), and the luxury flag, which is always ``'No'``.
        Stats missing from the page come back as whatever ``access_dict``
        returns for an absent key.
    """
    details = soup.find('div', class_='property-details-body fullwidth-content-container clearfix')
    asking_price = get_price(soup)
    street, city, state, zipcode = get_address(details)
    stats = get_sideinfo(details)

    # Stat labels in output order; the price sits between the two runs.
    before_price = ('Bathrooms', 'Bedrooms', 'Rooms', 'Waterfront', 'Cooling',
                    'Air Conditioning', 'Appliances', 'Laundry', 'House Size')
    after_price = ('Taxes', 'Listing Type', 'Listing ID', 'Lot Size', 'Listing Status',
                   'Year Built', 'County', 'Half Bath', 'Subdivision')

    row = [street, city, state, zipcode]
    row.extend(access_dict(stats, label) for label in before_price)
    row.append(asking_price)
    row.extend(access_dict(stats, label) for label in after_price)
    row.append('No')
    return row

In [295]:
def remax_collection(soup):
    """Assemble the flat feature row for a luxury ("collection") REMAX listing page.

    Args:
        soup: Parsed listing detail page.

    Returns:
        A 24-item list in the same order as ``remax_normal``: street, city,
        state, zip code, nine stats (bathrooms through house size), the
        price, nine more stats (taxes through subdivision), and the luxury
        flag, which is always ``'Yes'``.
    """
    # Uses get_price, the price helper defined in this notebook. The earlier
    # name get_price_normal is not defined anywhere visible, so it fails on a
    # fresh kernel.
    price = get_price(soup)
    content_tag = soup.find('div', class_='property-details--details')
    street, city, state, zipcode = get_address(content_tag)
    sidict = get_sideinfo(content_tag)

    # Stat labels in output order; the price sits between the two runs.
    before_price = ('Bathrooms', 'Bedrooms', 'Rooms', 'Waterfront', 'Cooling',
                    'Air Conditioning', 'Appliances', 'Laundry', 'House Size')
    after_price = ('Taxes', 'Listing Type', 'Listing ID', 'Lot Size', 'Listing Status',
                   'Year Built', 'County', 'Half Bath', 'Subdivision')

    unit = [street, city, state, zipcode]
    unit.extend(access_dict(sidict, label) for label in before_price)
    unit.append(price)
    unit.extend(access_dict(sidict, label) for label in after_price)
    unit.append('Yes')
    return unit

In [300]:
def remax_apt(soup, content_tag, is_lux=None):
    """Assemble the flat feature row for any REMAX listing page.

    Args:
        soup: Parsed listing detail page; used for the price and, when
            ``is_lux`` is not given, for the luxury check.
        content_tag: Details container already located by the caller; the
            address and the stats are read from it.
        is_lux: Optional luxury flag. When ``None`` (the default) it is
            derived with ``check_lux(soup)``.

    Returns:
        A 24-item list in the same order as ``remax_normal``: street, city,
        state, zip code, nine stats (bathrooms through house size), the
        price, nine more stats (taxes through subdivision), and ``'Yes'`` or
        ``'No'`` for the luxury flag.
    """
    # Uses get_price, the price helper defined in this notebook. The earlier
    # name get_price_normal is not defined anywhere visible, so it fails on a
    # fresh kernel.
    price = get_price(soup)
    street, city, state, zipcode = get_address(content_tag)
    sidict = get_sideinfo(content_tag)

    # This function serves both page layouts, so the flag must reflect the
    # page instead of being fixed to 'Yes'.
    if is_lux is None:
        is_lux = check_lux(soup)
    luxurious = 'Yes' if is_lux else 'No'

    # Stat labels in output order; the price sits between the two runs.
    before_price = ('Bathrooms', 'Bedrooms', 'Rooms', 'Waterfront', 'Cooling',
                    'Air Conditioning', 'Appliances', 'Laundry', 'House Size')
    after_price = ('Taxes', 'Listing Type', 'Listing ID', 'Lot Size', 'Listing Status',
                   'Year Built', 'County', 'Half Bath', 'Subdivision')

    unit = [street, city, state, zipcode]
    unit.extend(access_dict(sidict, label) for label in before_price)
    unit.append(price)
    unit.extend(access_dict(sidict, label) for label in after_price)
    unit.append(luxurious)
    return unit

In [296]:
def check_lux(soup):
    """Tell whether a parsed listing page is labelled as a luxury listing.

    Args:
        soup: Parsed listing detail page.

    Returns:
        ``True`` when the text of the ``js-stateformatted`` name span
        contains "luxury" (case-insensitive). ``False`` otherwise, including
        when the span is missing or any other error occurs.
    """
    selector = {
        'itemprop': 'name',
        'class': 'js-stateformatted'
    }
    try:
        label = soup.find('span', attrs=selector)
        return 'luxury' in label.get_text().strip().lower()
    except:
        # No matching span (or any other failure) counts as "not luxury".
        return False

In [304]:
def get_apt_info(apt_url):
    """Fetch one listing detail page and return its feature row.

    Args:
        apt_url: Site-relative listing path (it is appended to
            ``https://www.remax.com``).

    Returns:
        The list produced by ``remax_apt`` for the page, or ``None`` when the
        page responds with HTTP 404.
    """
    base_url = 'https://www.remax.com'
    response = requests.get(base_url + apt_url)

    # Return explicitly on a missing page; otherwise the result variable
    # would never be assigned and the final return would fail.
    if response.status_code == 404:
        return None

    soup = BeautifulSoup(response.content, 'lxml')
    # Luxury pages keep their details in a different container.
    if check_lux(soup):
        container_class = 'property-details--details'
    else:
        container_class = 'property-details-body fullwidth-content-container clearfix'
    content_tag = soup.find('div', class_=container_class)
    return remax_apt(soup, content_tag)

In [305]:
# Feature row scraped for the sample stored in normal_url.
get_apt_info(apt_url=normal_url)

['767 N 24TH ST',
 'Philadelphia',
 'PA',
 '19130',
 2.0,
 3.0,
 'Breakfast Room,  Dining Room,  Kitchen,  Laundry,  Living Room',
 'No',
 'Central A/C',
 'Yes',
 'Built-In Microwave,  Cooktop,  Dishwasher,  Oven/Range - Gas,  Range Hood',
 None,
 2128.0,
 710000.0,
 7268.0,
 'Condo/Townhome',
 'PAPH849490',
 1742.0,
 'Active',
 1920.0,
 'Philadelphia',
 None,
 'Fairmount',
 'Yes']

In [306]:
# Feature row scraped for the sample stored in collection_url.
get_apt_info(apt_url=collection_url)

['1417 N 8TH ST',
 'Philadelphia',
 'PA',
 '19122',
 None,
 None,
 None,
 'No',
 'Central A/C',
 'Yes',
 None,
 None,
 None,
 850000.0,
 489.0,
 'Condo/Townhome',
 'PAPH850180',
 1742.0,
 'Active',
 2020.0,
 'Philadelphia County',
 None,
 'Ludlow',
 'Yes']