In [283]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
import re

In [344]:
def _get_page_url(city, state, page_num):
    city = city.lower().replace(' ', '-')
    state = state.lower().replace(' ', '-')
    page = f'https://www.rent.com/{state}/{city}/apartments_condos_houses_townhouses?page={page_num}'
    return page

In [345]:
def _get_apt_urls_per_page(city, state, pg_num):
    pg_url = _get_page_url(city, state, pg_num)
    response = requests.get(pg_url)
    results = response.content
    apt_urls = []
    if not response.status_code == 404:
        soup = BeautifulSoup(results, 'lxml')
        apts = soup.find_all('div', class_='_3PdAH _1EbNE')
        for apt in apts:
            apt_sub = apt.find('div', class_='_3RRl_ _2Hrxl')
            apt_link = apt_sub.find('a', class_='_3kMwn ByXwK')
            url = apt_link['href']
            apt_urls.append(url)
    
    return apt_urls

In [346]:
def get_apt_urls(city, state, verbose=False):
    pg_url = _get_page_url(city, state, 1)
    response = requests.get(pg_url)
    results = response.content
    page_num = 0
    apt_urls = []
    
    if not response.status_code == 404:
        soup = BeautifulSoup(results, 'lxml')
        apts_num_tag = soup.find('span', class_='_3YJue')
        apts_num =  apts_num_tag.find('span', 
                                      attrs={'data-tid':'pagination-total'})\
                                .get_text()
        apts_num = int(apts_num)
        pages_num = int(np.ceil(apts_num/30))
        if verbose:
            print(f'total number of apartments in {city}, {state} is {apts_num}')
            print(f'total number of pages to be scraped is {pages_num}')
        
    for pg_num in range(pages_num):
        apt_urls += _get_apt_urls_per_page(city, state, pg_num)
        if verbose:
            print(f'page {pg_num} done')
    
    apt_urls = [url for url in apt_urls if state in url]
    
    return apt_urls

In [347]:
apt_urls = get_apt_urls('philadelphia', 'pennsylvania', verbose=True)

total number of apartments in philadelphia, pennsylvania is 2714
total number of pages to be scraped is 91
page 0 done
page 1 done
page 2 done
page 3 done


KeyboardInterrupt: 

In [363]:
def _get_address(address_tag):
    elements = address_tag.find_all('span')
    address = elements[0].get_text()\
                         .replace(',','')\
                         .strip()
    city = elements[1].get_text().strip()
    state = elements[2].get_text().strip()
    zipcode = elements[3].get_text().strip()
    return address, city, state, zipcode

def _get_units(unit_tag):
    unit = []
    for cell in unit_tag.find_all('td'):
        if cell.attrs:
            if cell['data-tid'] == 'pdpfloorplans-unit-displayText':
                unit_num = cell.get_text()
                unit.append(unit_num)
            if cell['data-tid'] == 'pdpfloorplans-unit-price':
                try:
                    unit_price = cell.get_text().replace('$', '')
                    unit.append(float(unit_price))
                except:
                    unit.append(np.nan)
            if cell['data-tid'] == 'pdpfloorplans-unit-bedbath':
                try:
                    bedbath_tag = cell.find_all('span')
                    bed_tag, bath_tag = bedbath_tag[0], bedbath_tag[1]
                    pattern = r'[-+]?\d*\.\d+|\d+'
                    bed = re.findall(pattern, bed_tag.get_text())
                    bath = re.findall(pattern, bath_tag.get_text())
                    bed_unit, bath_unit = 0, 0
                    if bed:
                        bed_unit = bed[0]
                    if bath:
                        bath_unit = bath[0]
                    unit.append(float(bed_unit))
                    unit.append(float(bath_unit))
                except:
                    unit.append(np.nan)
                    unit.append(np.nan)
            if cell['data-tid'] == 'pdpfloorplans-unit-sqft':
                try:
                    pattern = r'[-+]?\d*\.\d+|\d+'
                    sqft_unit = re.findall(pattern, cell.get_text())[0]
                    unit.append(float(sqft_unit))
                except:
                    unit.append(np.nan)
    return unit

def _get_floorplan(unit_tag):
    unit = []
    for cell in unit_tag.find_all('td'):
        if cell.attrs:
            if cell['data-tid'] == 'pdpfloorplan-displayText':
                floorplan_num = cell.get_text()
                unit.append(floorplan_num)
            if cell['data-tid'] == 'pdpfloorplan-price':
                try:
                    fp_price = cell.get_text()\
                                   .replace('$','')\
                                   .replace(',','')
                    pattern = r'[-+]?\d*\.\d+|\d+'
                    price = re.findall(pattern, fp_price)[0]
                    unit.append(float(price))
                except:
                    unit.append(np.nan)
            if cell['data-tid'] == 'pdpfloorplan-bedbaths':
                try:
                    bedbath_tag = cell.find_all('span')
                    bed_tag, bath_tag = bedbath_tag[0], bedbath_tag[1]
                    pattern = r'[-+]?\d*\.\d+|\d+'
                    bed = re.findall(pattern, bed_tag.get_text())
                    bath = re.findall(pattern, bath_tag.get_text())
                    bed_fp, bath_fp = 0, 0
                    if bed:
                        bed_fp = bed[0]
                    if bath:
                        bath_fp = bath[0]
                    unit.append(float(bed_fp))
                    unit.append(float(bath_fp))
                except:
                    unit.append(np.nan)
                    unit.append(np.nan)
            if cell['data-tid'] == 'pdpfloorplan-sqft':
                try:
                    pattern = r'[-+]?\d*\.\d+|\d+'
                    sqft_fp = re.findall(pattern, cell.get_text())[0]
                    unit.append(float(sqft_fp))
                except:
                    unit.append(np.nan)
    return unit

def _get_apt_info(apt_url):
    overhead = 'https://www.rent.com'
    complete_url = overhead+apt_url
    response = requests.get(complete_url)
    results = response.content
    apt_all = []
    
    if not response.status_code == 404:
        soup = BeautifulSoup(results, 'lxml')
        address_tag = soup.find('div', '_3wnFl _3wnFl')
        addr = _get_address(address_tag)
        
        room_tags = soup.find_all('div', '_1ECa-')
        
        for rt in room_tags:
            room_table = rt.find('table', '_1GkPp F4skJ')
            room_tbody = room_table.find('tbody')
            floor_plan = room_tbody.find_all('tr')
            apartments = []
            for unit_tag in floor_plan:
                if unit_tag['data-tid'] == 'pdpfloorplan-row':
                    apt = addr+_get_floorplan(unit_tag)
                    apartments.append(apt)

                if unit_tag['data-tid'] == 'pdpfloorplans-unit-row':
                    apt = list(addr)+_get_units(unit_tag)
                    apartments.append(apt)
            apt_all.append(apartments)
                        
    return apt_all

In [365]:
apt_url = apt_urls[1]
_get_apt_info(apt_url)

('1520 Hamilton Street', 'Philadelphia', 'Pennsylvania', '19130')


TypeError: can only concatenate tuple (not "list") to tuple

In [354]:
apt_url

'/pennsylvania/philadelphia-apartments/the-hamilton-4-100068395'

In [353]:
overhead = 'https://www.rent.com'
complete_url = overhead+'/pennsylvania/philadelphia-apartments/1213-walnut-4-100064785'
response = requests.get(complete_url)
results = response.content

if not response.status_code == 404:
    soup = BeautifulSoup(results, 'lxml')
    address_tag = soup.find('div', '_3wnFl _3wnFl')
    addr = _get_address(address_tag)

    room_tags = soup.find_all('div', '_1ECa-')

    rt1 = room_tags[0]
rt1

<div class="_1ECa-"><div class="_3tvKg" data-tid="pdp-floorplan-list-header">STUDIO</div><table class="_1GkPp F4skJ" data-tid="pdpfloorplan-table"><tbody><tr class="omeVN q6BdK" data-tid="pdpfloorplan-row"><td><div class="lazyload-placeholder" style="height:60px"></div></td><td data-tid="pdpfloorplan-displayText">Studio-S3</td><td data-tid="pdpfloorplan-price">$1,930 - $2,455/mo</td><td data-tid="pdpfloorplan-bedbaths"><div><span>Studio | </span><span>1 Bath</span></div></td><td data-tid="pdpfloorplan-sqft">433 Sqft</td><td class="_2s8ma" data-tid="pdpfloorplans-available">4 Available</td><td><span class="_2owZk _223Z4" data-tid="pdpfloorplans-expand-units" role="presentation"><svg data-tid="pdpfloorplans-expand-units" viewbox="0 0 12 9"><path d="M11.739 1.776A1.121 1.121 0 0 0 11.715.31a.92.92 0 0 0-1.353-.026L6 5.015 1.638.283A.92.92 0 0 0 .285.31a1.121 1.121 0 0 0-.024 1.467L6 8l1.376-1.493 4.363-4.73z" fill-rule="evenodd"></path></svg></span></td></tr><tr class="j4z-D" data-tid="pd