In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
import re
import os
import json

In [2]:
def get_buy_webpage(city,
                    state,
                    pg_num,
                    htype=('house', 'multi-family')):
    """Build the Trulia for-sale search URL for one page of results.

    Parameters
    ----------
    city : str
        City name. It is title-cased and spaces are replaced by underscores.
    state : str
        State abbreviation. It is upper-cased.
    pg_num : int or str
        Page number, placed in the ``{pg_num}_p`` segment of the URL.
    htype : str or iterable of str, optional
        Property type(s) to include. Every entry must be a key of the alias
        table below. A single string is treated as one property type.

    Returns
    -------
    str
        The search URL.

    Raises
    ------
    KeyError
        If an entry of ``htype`` is not a known property type.
    """
    overhead = 'https://www.trulia.com'
    dangle = 'for_sale'

    city = city.title().replace(' ', '_')
    state = state.upper()

    # Maps the short names accepted by this function to the tokens used in
    # the URL's "<types>_type" path segment.
    dict_alias = {
        'house': 'SINGLE-FAMILY_HOME',
        'condo': 'APARTMENT,CONDO,COOP',
        'townhouse': 'TOWNHOUSE',
        'multi-family': 'MULTI-FAMILY',
        'land': 'LOT%7CLAND',
        'mobile/manufactured': 'MOBILE%7CMANUFACTURED',
        'other': 'UNKNOWN',
    }

    # A bare string would otherwise be iterated character by character and
    # fail with a confusing KeyError on its first letter.
    if isinstance(htype, str):
        htype = [htype]

    houses = ','.join(dict_alias[h] for h in htype)

    return f'{overhead}/{dangle}/{city},{state}/{houses}_type/{pg_num}_p/'


In [3]:
# Build the Trulia for-sale search URL for page 1 of Philadelphia, PA,
# using get_buy_webpage's default property types.
url = get_buy_webpage('philadelphia', 'pa', 1)

In [4]:
def get_buy_apt_urls_per_page(city,
                              state,
                              pg_num,
                              htype=('house', 'multi-family')):
    """Return the listing links found on one Trulia for-sale results page.

    Parameters
    ----------
    city, state, pg_num, htype
        Forwarded unchanged to ``get_buy_webpage`` to build the page URL.

    Returns
    -------
    list of str
        The ``href`` of the first ``<a>`` inside each property-card ``<div>``.
        Empty when the server answers 404 or no card matches.
    """
    webpage = get_buy_webpage(city, state, pg_num, htype)

    # Here we added User-Agent to the header of our request
    # It is because sometimes the web server will check the
    # different fields of the header to block robot scrapers
    # User-Agent is the most common one because it is specific
    # to your browser.
    # Adjacent literals are concatenated, so no source indentation leaks into
    # the header value (a backslash inside a single literal would embed it).
    headers = {'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/58.0.3029.110 Safari/537.36')}
    response = requests.get(webpage, headers=headers)

    # A 404 page has nothing to parse; return an empty list rather than
    # falling through to an unassigned result.
    if response.status_code == 404:
        return []

    soup = BeautifulSoup(response.content, 'lxml')

    # The class string is matched exactly as written here.
    apt_class = 'PropertyCard__PropertyCardContainer-sc-1ush98q-0 gsDQZj Box-sc-8ox7qa-0 jIGxjA'
    apt_tags = soup.find_all('div', class_=apt_class)

    apt_urls = []
    for tag in apt_tags:
        link = tag.find('a')
        # Skip cards without a link (or a link without href) instead of crashing.
        href = link.get('href') if link is not None else None
        if href is not None:
            apt_urls.append(href)

    return apt_urls
        

In [5]:
# Collect the listing links found on page 1 of the Philadelphia, PA search
# and display them as the cell output.
apt_urls_pg1 = get_buy_apt_urls_per_page('philadelphia', 'pa', 1)
apt_urls_pg1

['/p/pa/philadelphia/1311-foulkrod-st-philadelphia-pa-19124--2017208781',
 '/p/pa/philadelphia/136-e-herman-st-philadelphia-pa-19144--2090061884',
 '/p/pa/philadelphia/302-carpenter-ln-philadelphia-pa-19119--1037425672',
 '/p/pa/philadelphia/3409-hess-st-philadelphia-pa-19136--2017291136',
 '/p/pa/philadelphia/6523-lincoln-dr-philadelphia-pa-19119--2017190538',
 '/p/pa/philadelphia/1017-surrey-rd-philadelphia-pa-19115--2017166645',
 '/p/pa/philadelphia/6838-ogontz-ave-philadelphia-pa-19138--2017307773',
 '/p/pa/philadelphia/102-w-apsley-st-philadelphia-pa-19144--1139698957',
 '/p/pa/philadelphia/6437-emlen-st-philadelphia-pa-19119--2017190137',
 '/p/pa/philadelphia/5044-homestead-st-philadelphia-pa-19135--2017281711',
 '/p/pa/philadelphia/11610-bustleton-ave-philadelphia-pa-19116--2017179901',
 '/p/pa/philadelphia/6902-epiphany-pl-philadelphia-pa-19128--2017244605',
 '/p/pa/philadelphia/9902-nicklaus-dr-philadelphia-pa-19115--1004594076',
 '/p/pa/philadelphia/1910-rittenhouse-sq-philad

In [6]:
# Fetch one listing page and parse it so the following cells can explore
# its structure.
# NOTE(review): the backslash sits inside the string literal, so the
# indentation of the continuation line becomes part of the User-Agent value.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) \
            AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
apt_wb = 'https://www.trulia.com/p/pa/philadelphia/302-carpenter-ln-philadelphia-pa-19119--1037425672'
response = requests.get(apt_wb, headers=headers)
results = response.content  # body of the HTTP response
soup = BeautifulSoup(results, 'lxml')  # parsed with the lxml parser

In [7]:
# Pull the text of the <script id="__NEXT_DATA__" type="application/json">
# element out of the parsed page; it is decoded with json.loads below.
jfile = soup.find('script', attrs={
    'id': '__NEXT_DATA__',
    'type': 'application/json',
}).get_text()


In [8]:
# Decode the embedded JSON text and list its top-level keys.
jdict = json.loads(jfile)
for key in jdict.keys():
    print(key)

dataManager
props
page
query
buildId
assetPrefix
runtimeConfig


In [9]:
# Inspect which keys sit under the 'props' entry.
jdict['props'].keys()

dict_keys(['abExperiments', 'abTracking', 'session', '_page', 'asPath', 'homeDetails', 'viewer', 'search', 'homeDetailsPromise', 'apolloState', 'apolloHeaders'])

In [10]:
# Inspect the keys of the 'homeDetails' entry under 'props'.
jdict['props']['homeDetails'].keys()

dict_keys(['url', 'media', 'metadata', 'pageText', 'currentStatus', '__typename', 'tracking', 'primaryNavigation', 'secondaryNavigation', 'isSaveable', 'isShareable', 'preferences', 'location', 'adTargetings', 'price', 'heroTags', 'priceChange', 'bedrooms', 'bathrooms', 'floorSpace', 'provider', 'activeForRentListing', 'surroundings', 'hoaFee', 'mortgageInfo', 'nearbyHomes', 'features', 'publicRecord', 'comparables', 'description', 'titleToPriceHistory', 'priceHistory', 'localProtections', 'taxes', 'marketComparisons', 'breadcrumbNavigation', 'backToSearch', 'localInfoSummary', 'parcel', 'nearbyPointsOfInterest', 'seoDescription', 'similarHomes', 'foreclosureInfo', 'community', 'activeForSaleListing', 'assignedSchools', 'providerListingId', 'flagListingReportTypes', 'affordability', 'propertyType'])

In [11]:
# Inspect the keys of the 'media' entry under 'homeDetails'.
jdict['props']['homeDetails']['media'].keys()

dict_keys(['metaTagHeroImages', '__typename', 'topThirdHeroImages', 'totalPhotoCount', 'threeDHomes', 'mapWithPin', 'videos', 'photos', 'streetView'])

In [12]:
# Look at one entry (index 2) of the 'photos' list to see how a single
# photo record is laid out.
jdict['props']['homeDetails']['media']['photos'][2]

{'url': {'extraSmallSrc': 'https://static.trulia-cdn.com/pictures/thumbs_5/zillowstatic/ISb91c89ycbre71000000000.jpg',
  'smallSrc': 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISb91c89ycbre71000000000.jpg',
  'mediumSrc': 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISb91c89ycbre71000000000.jpg',
  'largeSrc': 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISb91c89ycbre71000000000.jpg',
  'hiDpiExtraSmallSrc': 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISb91c89ycbre71000000000.jpg',
  'hiDpiSmallSrc': 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISb91c89ycbre71000000000.jpg',
  'hiDpiMediumSrc': 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISb91c89ycbre71000000000.jpg',
  'hiDpiLargeSrc': 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISb91c89ycbre71000000000.jpg',
  '__typename': 'MEDIA_ImageResource'},
 'webpUrl': {'extraSmallWebpSrc': 'https://static.trulia-cdn.com/

In [13]:
# Download one photo from a hard-coded URL and write the response body to
# image_name.jpg in the current working directory.
image_url = 'https://static.trulia-cdn.com/pictures/thumbs_5/zillowstatic/ISb91c89ycbre71000000000.jpg'
img_data = requests.get(image_url).content
with open('image_name.jpg', 'wb') as handler:  # binary mode: the body is written as-is
    handler.write(img_data)

In [14]:
# Display the entire 'props' dict.
# NOTE(review): this produces a very large output; consider inspecting
# specific keys instead.
jdict['props']

{'abExperiments': {'1148': 'a',
  '1177': 'a',
  '1181': 'b',
  '1182': 'b',
  '1186': 'c'},
 'abTracking': '1148a;1177a;1181b;1182b;1186c',
 'session': {'isBot': False,
  'userLifetimeId': '191119q172d112t1g96g98vh3e1km711',
  'requestId': '19eb300404688763a62f9c100c6a41c0',
  'navigatedFrom': False},
 '_page': {'title': '302 Carpenter Ln, Philadelphia, PA 19119 - 5 Bed, 3 Bath Single-Family Home - MLS# PAPH847616 - 29 Photos | Trulia',
  'description': "302 Carpenter Ln, Philadelphia, PA 19119 is a 3,012 sqft, 5 bed, 3 bath Single-Family Home listed for $650,000. Classic 3 story stone Queen Anne Victorian circa 1895 that has been updated for today's living! Features include a...",
  'metaTags': {'robots': {'name': 'robots', 'content': 'INDEX, FOLLOW'},
   'twitterCard': {'property': 'twitter:card', 'content': 'summary'},
   'twitterImageAlt': {'property': 'twitter:image:alt',
    'content': '302 Carpenter Ln, Philadelphia, PA 19119'},
   'ogUrl': {'property': 'og:url',
    'content':

In [15]:
def scrape_json(soup):
    """Return the page's embedded ``__NEXT_DATA__`` JSON as a Python object.

    ``soup`` is searched for a ``<script>`` element whose ``id`` is
    ``__NEXT_DATA__`` and whose ``type`` is ``application/json``; the text of
    that element is decoded with ``json.loads``.
    """
    wanted_attrs = {'id': '__NEXT_DATA__', 'type': 'application/json'}
    script_tag = soup.find('script', attrs=wanted_attrs)
    return json.loads(script_tag.get_text())

In [16]:
def scrape_img_urls(jdict):
    """Collect the ``mediumSrc`` URL of every photo in a listing's JSON.

    Photos are read from ``jdict['props']['homeDetails']['media']['photos']``
    and the result keeps their order.
    """
    media = jdict['props']['homeDetails']['media']
    medium_urls = []
    for photo in media['photos']:
        medium_urls.append(photo['url']['mediumSrc'])
    return medium_urls

In [17]:
def save_images(img_urls, data_path, img_type, address):
    """Download every image URL into ``data_path/img_type/address``.

    Files are named ``img0.jpg``, ``img1.jpg``, ... following the order of
    ``img_urls``. The process working directory is never changed.

    Parameters
    ----------
    img_urls : iterable of str
        URLs to download.
    data_path : str
        Existing base directory.
    img_type : str
        Sub-directory of ``data_path``; created if missing.
    address : str
        Sub-directory of ``img_type``; created if missing.

    Returns
    -------
    int
        1 when all images were written, 0 when ``data_path`` is not an
        existing directory or any step raised an exception.
    """
    try:
        # A missing base directory is a failure, not something to create.
        if not os.path.isdir(data_path):
            return 0

        # Build the target path explicitly instead of chdir-ing into it, so a
        # failure halfway through cannot leave the process in another directory.
        target_dir = os.path.join(data_path, img_type, address)
        os.makedirs(target_dir, exist_ok=True)

        for i, img_url in enumerate(img_urls):
            img_data = requests.get(img_url).content
            # f-string so each image gets its own file name.
            with open(os.path.join(target_dir, f'img{i}.jpg'), 'wb') as handler:
                handler.write(img_data)

        return 1

    except Exception:
        # Best-effort contract: report failure through the return value, but
        # let KeyboardInterrupt / SystemExit propagate.
        return 0

In [18]:
# Try the two helpers on the listing page parsed earlier (`soup`) and show
# the resulting photo URLs.
test_jdict = scrape_json(soup)
test_img_urls = scrape_img_urls(test_jdict)
test_img_urls

['https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISvgw4ch9cdv430000000000.jpg',
 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/IS7mojyi16a54a1000000000.jpg',
 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISb91c89ycbre71000000000.jpg',
 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISrhegouoc1k1l0000000000.jpg',
 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISrxuf38nbz82l0000000000.jpg',
 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISnqubpplomze71000000000.jpg',
 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISfksz6jhge2f71000000000.jpg',
 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/IS7eqnocd865f71000000000.jpg',
 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISz7ob6690y7f71000000000.jpg',
 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstatic/ISr1mznz4spaf71000000000.jpg',
 'https://static.trulia-cdn.com/pictures/thumbs_6/zillowstat

In [22]:
# Save the photos under ../data/sample/trulia/imgdata/sold/test_address;
# save_images returns 1 on success and 0 on failure.
save_images(test_img_urls, '../data/sample/trulia/imgdata', 'sold', 'test_address')

1

In [None]:
# NOTE(review): os.getcwd() returns the current working directory, so this
# chdir targets the directory the process is already in and changes nothing.
# If the aim is to return to an earlier directory, that path has to be saved
# before any directory change is made.
os.chdir(os.getcwd())