# In [2]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
import lxml
import numbers
from collections import defaultdict
from uszipcode import *

import sys
sys.path.insert(2, './apartments-scraper-master/')

from parse_apartments import *

# Lift pandas' display limits on column width and column count so printed
# DataFrames are not truncated.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

def load_big_city(change_rate=0.05, csv_path='./census_data/us_census_data_2022.csv', min_rows=3):
    """Load census rows with high growth whose ``usps`` value is well represented.

    Args:
        change_rate: Minimum value of the ``growth`` column (inclusive) for a
            row to be kept.
        csv_path: Location of the census CSV. Anything ``pandas.read_csv``
            accepts works (path, URL or file-like object). The file must
            provide ``growth`` and ``usps`` columns.
        min_rows: After the growth filter, a ``usps`` group is kept only when
            it still has at least this many rows.

    Returns:
        The filtered DataFrame. As a side effect, the number of surviving
        rows per ``usps`` value is printed.
    """
    census = pd.read_csv(csv_path)
    growing = census[census['growth'] >= change_rate]
    kept = growing.groupby('usps').filter(lambda group: len(group) >= min_rows)

    print(kept['usps'].value_counts())
    return kept
    
def get_all_city(population_limit=10000, density_limit=500, county='King County', major_city='Seattle'):
    """Build a table of ZIP codes to scrape, with ready-made search keys.

    Every 5-digit code from 00000 to 99999 is looked up through
    ``SearchEngine.by_zipcode``; codes whose lookup fails are skipped. The
    resulting rows are extended with three columns:

    * ``apartment_URL`` - ``https://www.apartments.com/<city>-<state>``
    * ``zillow_URL``    - ``http://www.zillow.com/homes/for_sale/<city>``
    * ``search_key``    - ``<city>-<state>-<zipcode>`` (lower-cased, spaces in
      the city name turned into hyphens)

    NOTE(review): the two URL columns keep any spaces of a multi-word city
    name as-is; only ``search_key`` is normalised here.

    Args:
        population_limit: Minimum ``population`` (inclusive) for a ZIP code.
        density_limit: Minimum ``population_density`` (inclusive).
        county: When not None, keep only rows with this ``county``.
        major_city: Used only when ``county`` is None; keeps only rows with
            this ``major_city``. With the default arguments the county
            filter wins and this argument has no effect.

    Returns:
        The filtered DataFrame. The number of remaining rows is printed.
    """
    engine = SearchEngine()

    records = {}
    for number in range(100000):
        zipcode = str(number).zfill(5)
        try:
            records[zipcode] = engine.by_zipcode(zipcode).to_dict()
        except Exception:
            # Best effort: a code that cannot be resolved is simply left out.
            # ``Exception`` (not a bare except) so Ctrl-C still interrupts
            # this long loop.
            continue

    allzips = pd.DataFrame(records).T.reset_index(drop=True)

    def city_apartment_url(x):
        url = 'https://www.apartments.com/' + x.major_city.lower() + '-' + x.state.lower()
        return url

    def city_zillow_url(x):
        url = 'http://www.zillow.com/homes/for_sale/' + x.major_city.lower()
        return url

    allzips['apartment_URL'] = allzips.apply(city_apartment_url, axis=1)
    allzips['zillow_URL'] = allzips.apply(city_zillow_url, axis=1)

    # ``Series.replace`` only swaps whole cell values, so it never touched the
    # space inside e.g. "Federal Way". ``.str.replace`` works on substrings.
    # Hyphens are used because the apartments.com slugs seen in this file
    # ("regents-park-chicago-il") hyphenate multi-word names.
    city_slug = allzips['major_city'].str.replace(' ', '-', regex=False).str.lower()
    allzips['search_key'] = city_slug + '-' + allzips['state'].str.lower() + '-' + allzips.zipcode

    allzips = allzips[(allzips['population'] >= population_limit) & (allzips['population_density'] >= density_limit)]

    # NOTE(review): the result is not used - the filter that excluded these
    # places from ``allzips`` is disabled. The call is kept so the census
    # summary it prints still appears; confirm whether it can be dropped.
    load_big_city()

    if county is not None:
        allzips = allzips[(allzips['county'] == county)]
    elif major_city is not None:
        allzips = allzips[(allzips['major_city'] == major_city)]

    print("Number of city:{}".format(str(allzips.shape[0])))
    return allzips


def get_apt_url(search_key):
    """Collect the listing links that apartments.com shows for a search key.

    The first request reads the result count (``h3.count``) and the pager
    text (``p.searchResults``), whose last word is taken as the number of
    result pages (1 when the pager is absent). Each page
    ``<url>/<page number>/`` is then fetched and every ``li.mortar-wrapper``
    card contributes one row.

    Args:
        search_key: Path fragment appended to ``https://www.apartments.com/``,
            e.g. ``'chicago-il-60615'``.

    Returns:
        DataFrame with columns ``link`` (card ``data-url``), ``address``
        (card ``data-streetaddress``), ``supply`` (result-count text, or None
        when the page has no count header) and ``search_key``.
    """
    url = 'https://www.apartments.com/' + search_key
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    timeout = 30  # seconds; without it a stalled connection blocks forever

    page = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Both header elements are optional on the page; guard them the same way.
    supply = soup.find('h3', {"class": "count"})
    supply_str = supply.text.strip() if supply is not None else None
    total_num_pages = soup.find('p', {"class": "searchResults"})
    num_string = total_num_pages.text.strip() if total_num_pages is not None else '1'
    num_page = int(num_string.split()[-1])
    print("Number of page:{}".format(str(num_page)))

    pages = []
    for page_number in range(1, num_page + 1):
        url_with_page = url + '/' + str(page_number) + '/'
        print(url_with_page)
        page = requests.get(url_with_page, headers=headers, timeout=timeout)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Look the cards up once and read both attributes from the same list.
        cards = soup.find_all('li', {'class': 'mortar-wrapper'})
        df = pd.DataFrame()
        df['link'] = [card.article['data-url'] for card in cards]
        df['address'] = [card.article['data-streetaddress'] for card in cards]
        df['supply'] = supply_str
        print("Number of listing:{}".format(str(df.shape[0])))

        pages.append(df)

    data = pd.concat(pages)
    data['search_key'] = search_key
    return data


def _numeric_from_text(series):
    """Keep only digits, '-' and '.' in each value of *series* and parse it as a number.

    Values with nothing parseable left (missing pieces, purely textual
    labels) become NaN instead of raising.
    """
    cleaned = series.astype(str).str.replace('[^-.0-9]', '', regex=True)
    return pd.to_numeric(cleaned, errors='coerce')


def get_apt_price_detail(url):
    """Scrape the per-floor-plan pricing table of one apartments.com listing.

    One output row is produced per ``span.rentLabel`` element, paired
    positionally with every other ``span.detailsTextWrapper`` element.
    Page-level fields (name, state, zipcode, neighborhood, year, home type)
    are repeated on every row.

    Args:
        url: Listing URL, e.g.
            ``https://www.apartments.com/regents-park-chicago-il/5sl3n3q/``.

    Returns:
        DataFrame with columns ``state``, ``zipcode``, ``neighborhood``,
        ``property_name``, ``year``, ``home_type``, ``model``, ``avg_price``,
        ``min_price``, ``max_price``, ``bath``, ``sqft``, ``url`` and
        ``model_detail``. ``min_price``/``max_price`` are the numeric parts
        before/after the en dash of the rent label; ``avg_price`` is their
        row-wise mean, which equals ``min_price`` when only one price is
        given.

    Raises:
        ValueError: if the number of rent labels and model descriptions
            differs (pandas rejects the mismatched column assignment).
    """
    import re  # local import keeps this block self-contained

    print(url)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

    page = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')

    name = [x.text.strip() for x in soup.find_all('h1', {'class': 'propertyName'})]
    zipcode = [x.text.strip() for x in soup.find_all('span', {'class': 'stateZipContainer'})]
    neighborhood = [x.text.strip() for x in soup.find_all('a', {'class': 'neighborhood'})]
    price_detail = [x.text for x in soup.find_all('span', {'class': 'rentLabel'})]
    # Only every other entry is kept - presumably each description is
    # rendered twice on the page (TODO confirm).
    model_detail = [x.text.strip() for x in soup.find_all('span', {'class': 'detailsTextWrapper'})][::2]

    # Both are flattened to the repr of a list so they can be searched as text.
    element = str([x.text.strip() for x in soup.find_all('div', {'class': 'column'})])
    home_type = str([x.text.strip() for x in soup.find_all('script', {'type': 'text/javascript'})])

    df = pd.DataFrame()
    df['price_detail'] = price_detail
    df['model_detail'] = model_detail

    price_parts = df['price_detail'].str.split('–')
    df['min_price'] = price_parts.str[0]
    df['max_price'] = price_parts.str[1]

    df['price_detail'] = df['price_detail'].astype('str')
    df['model_detail'] = df['model_detail'].astype('str')

    df.replace('\r\n', '', regex=True, inplace=True)
    model_parts = df['model_detail'].str.split(',')
    df['model'] = model_parts.str[0].astype('str')
    df['bath'] = model_parts.str[1].astype('str')
    # Third token from the end of the description is taken as the area figure.
    df['sqft'] = _numeric_from_text(df['model_detail'].str.split(' ').str[-3])
    df['url'] = url

    # Take the first match explicitly: unpacking the list into ``str()``
    # raised TypeError as soon as the page had more than one match.
    df['property_name'] = name[0] if name else ''
    state_zip = (zipcode[0] if zipcode else '').split('\n')
    df['state'] = state_zip[0]
    df['zipcode'] = state_zip[1] if len(state_zip) > 1 else np.nan
    df['neighborhood'] = neighborhood[0] if neighborhood else ''

    # Match the year explicitly so a page without "Built in" yields NaN
    # rather than an arbitrary slice of text (or an IndexError).
    built = re.search(r'Built in (\d{4})', element)
    df['year'] = int(built.group(1)) if built else np.nan

    # Offsets are tuned to the escaped script text between the
    # 'propertyType' and 'listingNeighborhood' keys; kept exactly as-is.
    home_type = home_type[home_type.find('propertyType')+16:home_type.find('listingNeighborhood')-19]
    df['home_type'] = home_type.split("\'")[0]

    # Convert each price column on its own. Previously ``max_price`` was
    # gated on ``min_price``'s dtype *after* conversion and always ended up NaN.
    df['min_price'] = _numeric_from_text(df['min_price'])
    df['max_price'] = _numeric_from_text(df['max_price'])
    # Row-wise mean skips NaN: midpoint for ranges, the single price otherwise.
    df['avg_price'] = df[['min_price', 'max_price']].mean(axis=1)

    return df[['state','zipcode','neighborhood','property_name','year','home_type','model','avg_price','min_price','max_price','bath','sqft','url','model_detail']]


def get_apt_nearby_detail(url):
    """Scrape the "nearby places" tables of one apartments.com listing.

    All HTML tables on the page are parsed once. Seven consecutive tables are
    then labelled, in order, as ``edu``, ``subway``, ``rail``, ``airport``,
    ``shopping``, ``park`` and ``military``. The run starts at table index 1
    when the first table on the page has two columns, otherwise at index 2.

    Args:
        url: Listing URL, e.g.
            ``https://www.apartments.com/regents-park-chicago-il/5sl3n3q/``.

    Returns:
        DataFrame with columns ``url``, ``type``, ``name``, ``method``,
        ``time`` and ``Distance``; colons are stripped from the scraped text.

    Raises:
        IndexError: if the page has fewer tables than the seven expected.
        ValueError: if a table does not have exactly four columns, or the
            page contains no tables at all.
    """
    from io import StringIO  # read_html expects a file-like object, not raw HTML

    print(url)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

    page = requests.get(url, headers=headers, timeout=30)

    # Parse the page a single time instead of once per category.
    tables = pd.read_html(StringIO(page.text))
    first = 1 if len(tables[0].columns) == 2 else 2

    categories = ['edu', 'subway', 'rail', 'airport', 'shopping', 'park', 'military']
    frames = []
    for position, category in enumerate(categories, start=first):
        table = tables[position]
        table.columns = ['name', 'method', 'time', 'Distance']
        table['type'] = category
        frames.append(table)

    data = pd.concat(frames)
    data.replace(':', '', regex=True, inplace=True)
    # Added after the colon clean-up so the scheme separator in "https://"
    # is not stripped from the URL.
    data['url'] = url

    return data[['url','type','name', 'method', 'time', 'Distance']]


def wrap_apt(search_key):
    """Scrape price details for every listing found under a search key.

    Listings come from ``get_apt_url``; each listing's link is passed to
    ``get_apt_price_detail`` and the listing's ``address``, ``search_key``
    and ``supply`` are copied onto the resulting rows.

    Args:
        search_key: apartments.com search key, e.g. ``'chicago-il-60651'``.

    Returns:
        De-duplicated DataFrame with one row per floor plan. When the search
        returns no listings, an empty DataFrame with the same columns.
    """
    columns = ['search_key','supply','state','zipcode','neighborhood','address','property_name','year','home_type','model','avg_price','min_price','max_price','bath','sqft','url','model_detail']

    listings = get_apt_url(search_key)

    details = []
    for _, listing in listings.iterrows():
        detail = get_apt_price_detail(listing['link'])
        detail['address'] = listing['address']
        detail['search_key'] = listing['search_key']
        detail['supply'] = listing['supply']
        details.append(detail)

    # pd.concat raises on an empty list; a search without listings is a
    # valid, empty result.
    if not details:
        return pd.DataFrame(columns=columns)

    return pd.concat(details)[columns].drop_duplicates()

def get_apt_data(population_limit=10000, density_limit=500, county='King County', major_city='Seattle'):
    """Scrape apartment prices for every ZIP code selected by ``get_all_city``.

    For each selected ZIP code, ``wrap_apt`` is run on its ``search_key`` and
    the ZIP-level attributes (city, county, population, housing and income
    figures) are copied onto every scraped row. A ZIP code whose scrape
    raises is reported and skipped.

    Args:
        population_limit: Forwarded to ``get_all_city``.
        density_limit: Forwarded to ``get_all_city``.
        county: Forwarded to ``get_all_city``.
        major_city: Forwarded to ``get_all_city``.

    Returns:
        Concatenated DataFrame of all scraped rows, with the value
        ``'ApartmentUnitForRent'`` replaced by ``'Apartment'``. An empty
        DataFrame when nothing could be scraped.
    """
    zip_columns = [
        'major_city',
        'county',
        'timezone',
        'radius_in_miles',
        'population',
        'population_density',
        'land_area_in_sqmi',
        'water_area_in_sqmi',
        'housing_units',
        'occupied_housing_units',
        'median_home_value',
        'median_household_income',
    ]

    us_data = get_all_city(population_limit=population_limit, density_limit=density_limit, county=county, major_city=major_city)

    results = []
    for _, row in us_data.iterrows():
        print(row['search_key'])
        try:
            df = wrap_apt(row['search_key'])
        except Exception as error:
            # Skip this ZIP code entirely. Falling through would reuse the
            # previous iteration's frame and overwrite its ZIP attributes
            # (or hit an undefined name on the first iteration).
            print("Skipping {}: {}".format(row['search_key'], error))
            continue

        for column in zip_columns:
            df[column] = row[column]

        results.append(df)

    # pd.concat raises on an empty list.
    if not results:
        return pd.DataFrame()

    final = pd.concat(results)
    final = final.replace('ApartmentUnitForRent', 'Apartment')

    return final