# Scrape DC Apartment Listings from ApartmentList.com

In [18]:
# Load packages 

import requests  
from bs4 import BeautifulSoup
import pandas as pd
import time

# Many thanks to Ann Mohan Kunnath for the code to scrape Apartments.com
# (https://towardsdatascience.com/an-introduction-to-web-scraping-with-python-bc9563fe8860)
# It has been partially repurposed here for ApartmentList.com!

# Scrape every results page of apartments in Washington, DC, collect one row
# per apartment into `apts_list`, and write the rows out as a CSV file.

# Seconds to wait for an HTTP response before giving up, so that a stalled
# connection cannot hang the whole scrape.
REQUEST_TIMEOUT = 30

# Seconds to pause after each page request, to give the site a chance to recover.
PAGE_DELAY = 2

# Placeholder stored when an expected element is absent from the HTML.
MISSING = 'NA'

# CSS classes used to locate each piece of information.
# NOTE(review): the 'css-...' names look auto-generated and will presumably
# change when the site is redeployed -- update them here if the scrape
# suddenly returns nothing but 'NA'.
LISTING_CLASS = 'ListingCard'
APARTMENT_CLASS = 'css-1oxqqna e1i6tqc31'
BUILDING_FIELD_CLASSES = [
    'css-d6gmau e1k7pw6k4',   # building_name
    'css-17xjl8p e1k7pw6k5',  # building_address
    'css-11wmgwu e1k9ondy1',  # building_description
    'css-1qplh4f e1k7pw6k2',  # building_units_available
]
APARTMENT_FIELD_CLASSES = [
    'css-xjvzth e1i6tqc32',   # apt_type
    'css-ajwnv4 e1i6tqc33',   # apt_rent
    'css-o1qo1i e1i6tqc34',   # apt_footage
]

# Column names of the output table, in the same order the rows are built.
COLUMNS = ['building_name', 'building_address', 'building_description',
           'building_units_available', 'apt_type', 'apt_rent', 'apt_footage']


def _fetch_soup(url):
    """Return the parsed HTML of *url*, or None if it could not be fetched.

    A request error or a non-200 status is reported on stdout and turned
    into None so that one bad page does not discard the rows already
    collected.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('Skipping {}: {}'.format(url, err))
        return None
    if response.status_code != 200:
        print('Skipping {}: HTTP status {}'.format(url, response.status_code))
        return None
    return BeautifulSoup(response.content, 'html.parser')


def _text_of(parent, css_class):
    """Return the text of the first <div> in *parent* with class *css_class*.

    Returns MISSING when no such <div> exists.
    """
    # The filter must be a dict; a set such as {'class', 'x'} is read by
    # BeautifulSoup as "class is 'class' OR 'x'", which only works by accident.
    node = parent.find('div', {'class': css_class})
    return node.text if node is not None else MISSING


def _last_page_number(soup):
    """Return the highest page number shown in the pagination bar of *soup*.

    Falls back to 1 when the pagination bar is absent or cannot be read.
    """
    nav = soup.find('nav', {'class': 'MuiPagination-root'})
    if nav is None:
        return 1
    items = nav.find_all('li')
    if len(items) < 2:
        return 1
    # The second-to-last item is taken as the last page number (the final
    # item is presumably the "next page" control).
    try:
        return int(items[-2].text)
    except ValueError:
        return 1


def _listing_rows(listing):
    """Return one row per apartment found in the building card *listing*.

    Each row is the four building-level fields followed by the three
    apartment-level fields, in COLUMNS order.
    """
    building_fields = [_text_of(listing, css) for css in BUILDING_FIELD_CLASSES]
    return [
        building_fields + [_text_of(apt, css) for css in APARTMENT_FIELD_CLASSES]
        for apt in listing.find_all('div', {'class': APARTMENT_CLASS})
    ]


# Rows gathered so far, one per apartment.
apts_list = []

# Base URL - apartments in Washington, DC
base_url = 'https://www.apartmentlist.com/dc/washington'

# The landing page is only used to discover how many result pages exist.
first_page = _fetch_soup(base_url)
if first_page is not None:
    last_page = _last_page_number(first_page)

    # Start working through each of the pages
    for page_number in range(1, last_page + 1):
        url = '{}/page-{}'.format(base_url, page_number)
        sopa = _fetch_soup(url)

        if sopa is not None:
            for listing in sopa.find_all('div', {'class': LISTING_CLASS}):
                apts_list.extend(_listing_rows(listing))

                # Print building name when done
                building_name = _text_of(listing, BUILDING_FIELD_CLASSES[0])
                print('Page {}: {}'.format(page_number, building_name))

        # Give the site a chance to recover. Pause once per request (also
        # after a failed one) rather than once per listing: parsing a listing
        # sends no traffic, so sleeping there only slowed the scrape down.
        time.sleep(PAGE_DELAY)

# Make this a dataframe. Passing the column names to the constructor keeps
# this working (with an empty table) even when no rows were collected.
apts_df = pd.DataFrame(apts_list, columns=COLUMNS)

# Read it out
apts_df.to_csv("ApartmentList_Information.csv")

if not apts_list:
    print('Warning: no apartments were found; the output file has no rows.')

print('Scrape Complete! Look in your current folder for results of the search.')

Page 1: 1500 Mass
Page 1: Park Connecticut
Page 1: 3003 Van Ness
Page 1: 100K
Page 1: 455 Eye Street
Page 1: Connecticut Heights
Page 1: The Flats at Dupont Circle
Page 1: Alban Towers
Page 1: Calvert Woodley
Page 1: 2501 Porter
Page 1: Cleveland House
Page 1: Corcoran House at Dupont Circle
Page 1: 425 Mass
Page 1: 2400 M
Page 1: 1210 Mass
Page 1: RiverPoint
Page 1: The Kelvin
Page 1: Senate Square
Page 1: 1331
Page 1: 555
Page 1: Resa
Page 1: The Batley
Page 1: 14W Apartments
Page 1: The Channel
Scrape Complete! Look in your current folder for results of the search.
