## 1) Importing Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pandas as pd

## 2) HTTP Request

#### a) Storing the website:

In [2]:
url = 'https://www.trulia.com/CA/San_Diego/'

#### b) Get Request:

# Build a requests session and mount one retry-enabled adapter for both URL schemes.
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

# NOTE(review): the response returned by this call is not stored anywhere.
session.get(url)

In [3]:
# Fetch the search page and keep the response for the cells below.
# NOTE(review): this uses the module-level requests.get, not the `session` object
# built in the previous step, so the adapter mounted on that session is not involved here.
response = requests.get(url)

#### c) Status Code:

In [4]:
response.status_code

200

## 3) Soup Object

In [5]:
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
soup

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><script>
            window.__uspapi = function(command, version, callback) {
              try {
                if (command === 'getUSPData') {
                  var cookies = document.cookie.split(';');
                  for (var i = 0; i < cookies.length; i++) {
                    var cookie = cookies[i];
                    var separatorIndex = cookie.indexOf('=');
                    separatorIndex = separatorIndex < 0 ? cookie.length : separatorIndex;
                    var cookie_name = decodeURIComponent(cookie.slice(0, separatorIndex).replace(/^\s+/, ''));
                    if (cookie_name === 'usprivacy') {
                      var uspString = decodeURIComponent(cookie.slice(separatorIndex + 1));
                      callback({ version: version, uspString: uspString }, true);
                      return;
                    }
                  }
                }
              } catch (ex) {
                

## 4) Results

In [7]:
result_container = soup.find_all('li', {'class':'SearchResultsList__WideCell-b7y9ki-2'})

In [8]:
len(result_container)

42

## 5) Update Results

**We need the elements which have the attribute 'data-testid':**

In [9]:
# Keep only the result cells that carry a 'data-testid' attribute.
results_update = [r for r in result_container if r.has_attr('data-testid')]

In [10]:
len(results_update)

40

## 6) Concatenate the Two URL Parts to Get the Absolute URL

#### a) Url part I:

In [11]:
url_1 = 'https://www.trulia.com'

#### b) Creating a list for Url part II:

In [12]:
# Collect the href of the first <a> inside every 'property-card-details' div,
# walking the filtered result cells in order.
url_2 = [
    card_details.find('a').get('href')
    for result in results_update
    for card_details in result.find_all('div', {'data-testid':'property-card-details'})
]

In [13]:
url_2

['/p/ca/san-diego/1907-robinson-ave-108-san-diego-ca-92104--2079701553',
 '/p/ca/san-diego/5250-lenox-dr-san-diego-ca-92114--2079814944',
 '/p/ca/san-diego/7720-tyrolean-rd-san-diego-ca-92126--2079935792',
 '/p/ca/san-diego/6124-artisan-way-san-diego-ca-92130--2365905698',
 '/p/ca/san-diego/12614-darkwood-rd-san-diego-ca-92129--2079977957',
 '/p/ca/san-diego/3078-broadway-107-san-diego-ca-92102--2079683296',
 '/p/ca/san-diego/12048-rue-des-amis-san-diego-ca-92131--2080015943',
 '/p/ca/san-diego/2287-loring-st-1-san-diego-ca-92109--2079762188',
 '/p/ca/san-diego/4671-hamilton-st-4-san-diego-ca-92116--2079847521',
 '/p/ca/san-diego/1951-47th-st-146-san-diego-ca-92102--2079678847',
 '/p/ca/san-diego/5170-clairemont-mesa-blvd-51-18-san-diego-ca-92117--2504280231',
 '/p/ca/san-diego/10676-rancho-carmel-dr-san-diego-ca-92128--2079957515',
 '/p/ca/san-diego/4864-49th-st-san-diego-ca-92115--1020988340',
 '/p/ca/san-diego/303-s-46th-st-san-diego-ca-92113--2146021556',
 '/p/ca/san-diego/4545-col

In [14]:
len(url_2)

40

#### c) Joining Url 1 and Url 2:

In [15]:
import urllib.parse

# Resolve every relative listing path against the site root (url_1).
url_joined = [urllib.parse.urljoin(url_1, relative_path) for relative_path in url_2]
    

In [16]:
url_joined

['https://www.trulia.com/p/ca/san-diego/1907-robinson-ave-108-san-diego-ca-92104--2079701553',
 'https://www.trulia.com/p/ca/san-diego/5250-lenox-dr-san-diego-ca-92114--2079814944',
 'https://www.trulia.com/p/ca/san-diego/7720-tyrolean-rd-san-diego-ca-92126--2079935792',
 'https://www.trulia.com/p/ca/san-diego/6124-artisan-way-san-diego-ca-92130--2365905698',
 'https://www.trulia.com/p/ca/san-diego/12614-darkwood-rd-san-diego-ca-92129--2079977957',
 'https://www.trulia.com/p/ca/san-diego/3078-broadway-107-san-diego-ca-92102--2079683296',
 'https://www.trulia.com/p/ca/san-diego/12048-rue-des-amis-san-diego-ca-92131--2080015943',
 'https://www.trulia.com/p/ca/san-diego/2287-loring-st-1-san-diego-ca-92109--2079762188',
 'https://www.trulia.com/p/ca/san-diego/4671-hamilton-st-4-san-diego-ca-92116--2079847521',
 'https://www.trulia.com/p/ca/san-diego/1951-47th-st-146-san-diego-ca-92102--2079678847',
 'https://www.trulia.com/p/ca/san-diego/5170-clairemont-mesa-blvd-51-18-san-diego-ca-92117--

In [17]:
len(url_joined)

40

## 7) Get the data from the first link

#### a) Store the first link:

In [18]:
first_link = url_joined[0]

#### b) Get request and soup object:

In [19]:
response = requests.get(first_link)

In [20]:
soup = BeautifulSoup(response.content, 'html.parser')

In [21]:
soup

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><script>
            window.__uspapi = function(command, version, callback) {
              try {
                if (command === 'getUSPData') {
                  var cookies = document.cookie.split(';');
                  for (var i = 0; i < cookies.length; i++) {
                    var cookie = cookies[i];
                    var separatorIndex = cookie.indexOf('=');
                    separatorIndex = separatorIndex < 0 ? cookie.length : separatorIndex;
                    var cookie_name = decodeURIComponent(cookie.slice(0, separatorIndex).replace(/^\s+/, ''));
                    if (cookie_name === 'usprivacy') {
                      var uspString = decodeURIComponent(cookie.slice(separatorIndex + 1));
                      callback({ version: version, uspString: uspString }, true);
                      return;
                    }
                  }
                }
              } catch (ex) {
                

#### Address:

In [22]:
soup.find('span', {'data-testid':'home-details-summary-headline'}).get_text()

'1907 Robinson Ave #108'

#### Bedrooms:

In [23]:
soup.find('li', {'data-testid':'bed'}).get_text()

'2 Beds'

#### Bathrooms:

In [24]:
soup.find('li', {'data-testid':'bath'}).get_text()

'1 Bath'

#### Sqft:

In [25]:
soup.find('li', {'data-testid':'floor'}).get_text()

'862 sqft (on 0.51 acres)'

#### Year built:

In [26]:
# Locate the <div> whose string is 'Year Built' and read the text of the next <div>.
# find_next is the current spelling of the deprecated findNext alias.
soup.find('div', string='Year Built').find_next('div').get_text()

'1976'

#### Parking:

In [27]:
# Locate the <div> whose string is 'Parking' and read the text of the next <div>.
# find_next is the current spelling of the deprecated findNext alias.
soup.find('div', string='Parking').find_next('div').get_text()

'Garage'

#### Pool:

In [28]:
# Locate the <div> whose string is 'Pool' and read the text of the next <div>.
# find_next is the current spelling of the deprecated findNext alias.
soup.find('div', string='Pool').find_next('div').get_text()

'Pool'

#### Price:

In [29]:
soup.find('h3', {'data-testid':'on-market-price-details'}).get_text()

'$558,100'

#### Putting it all together for page 1

In [30]:
def _first_text(tag):
    """Return the text of ``tag``, or '' when the lookup found nothing (``tag`` is None)."""
    return '' if tag is None else tag.get_text()


def _labelled_value(page, label):
    """Return the text of the first <div> after the <div> whose string equals ``label``.

    Returns '' when the label, or a <div> following it, is not present in ``page``.
    """
    heading = page.find('div', string=label)
    return '' if heading is None else _first_text(heading.find_next('div'))


address = []
bedrooms = []
bathrooms = []
sqft = []
year_built = []
parking = []
pool = []
price = []

# loop through all joined links
for i in url_joined:
    response = requests.get(i)

    # create soup object
    soup = BeautifulSoup(response.content, 'html.parser')

    # Fields located through their data-testid attribute. A missing element yields ''
    # so that all eight lists keep the same length (one entry per listing).
    address.append(_first_text(soup.find('span', {'data-testid':'home-details-summary-headline'})))
    bedrooms.append(_first_text(soup.find('li', {'data-testid':'bed'})))
    bathrooms.append(_first_text(soup.find('li', {'data-testid':'bath'})))
    sqft.append(_first_text(soup.find('li', {'data-testid':'floor'})))
    price.append(_first_text(soup.find('h3', {'data-testid':'on-market-price-details'})))

    # Fields located through a label <div> followed by a value <div>.
    year_built.append(_labelled_value(soup, 'Year Built'))
    parking.append(_labelled_value(soup, 'Parking'))
    pool.append(_labelled_value(soup, 'Pool'))

# creating a dictionary with the results -- built once, after the loop, so that
# `output` is also defined when url_joined is empty
output = {'Address': address, 'Bedrooms':bedrooms, 'Bathrooms': bathrooms, 'Area': sqft, 'Year Built': year_built,
         'Parking': parking, 'Pool': pool, 'Price': price}

In [31]:
output

{'Address': ['1907 Robinson Ave #108',
  '5250 Lenox Dr',
  '7720 Tyrolean Rd',
  '6124 Artisan Way',
  '12614 Darkwood Rd',
  '3078 Broadway #107',
  '12048 Rue Des Amis',
  '2287 Loring St #1',
  '4671 Hamilton St #4',
  '1951 47th St #146',
  '5170 Clairemont Mesa Blvd #51-18',
  '10676 Rancho Carmel Dr',
  '4864 49th St',
  '303 S 46th St',
  '4545 Collwood Blvd #20',
  '3228 44th St #5',
  '3007 Central Ave',
  '978 Granger St',
  '7620 Stalmer St #106',
  '2737 Lungos Ct',
  '2930 Naugatuck Ave',
  '4845 Castle Ave',
  '3872 Creststone Pl',
  '236 Payne St',
  '8190 Califa Ct',
  '12535 Alcacer Del Sol',
  '3135 Manos Dr',
  '328 W Lewis St',
  '821 Reef Dr',
  '3573 Chasewood Dr',
  '1656 Yost Dr',
  '2615 Nansen Ave',
  '11445 Spica Dr',
  '4120 Vivian St',
  '880 60th St',
  '5290 Timber Branch Way',
  '5407 Baja Dr',
  '4536 Arizona St #4',
  '8886 La Cartera St',
  '5850 Camino De La Costa'],
 'Area': ['862 sqft (on 0.51 acres)',
  '1,289 sqft',
  '1,227 sqft',
  '3,989 sqft

In [32]:
df = pd.DataFrame(output)

df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Pool,Price
0,1907 Robinson Ave #108,2 Beds,1 Bath,862 sqft (on 0.51 acres),1976,Garage,Pool,"$558,100"
1,5250 Lenox Dr,4 Beds,2 Baths,"1,289 sqft",1973,2 Car Garage,No,"$460,000"
2,7720 Tyrolean Rd,3 Beds,2 Baths,"1,227 sqft",1972,1 Car Garage,Yes,"$798,000"
3,6124 Artisan Way,5 Beds,5 Baths,"3,989 sqft",2018,2 Car Garage,No,"$2,295,000"
4,12614 Darkwood Rd,4 Beds,3 Baths,"2,287 sqft",1987,3 Car Garage,Yes,"$1,299,000"
5,3078 Broadway #107,2 Beds,2 Baths,827 sqft (on 0.74 acres),1993,Garage,No,"$469,000"
6,12048 Rue Des Amis,3 Beds,3 Baths,"1,622 sqft",1984,2 Car Garage,Yes,"$999,000"
7,2287 Loring St #1,3 Beds,3 Baths,"2,312 sqft (on 1.40 acres)",1989,2 Car Garage,,"$1,099,000"
8,4671 Hamilton St #4,2 Beds,2 Baths,739 sqft (on 0.30 acres),2000,2 Parking Spaces,Pool,"$499,000"
9,1951 47th St #146,2 Beds,2 Baths,"1,466 sqft (on 1.98 acres)",2014,No Info,,"$169,900"


## Scraping Multiple Pages - San Diego

In [34]:
address = []
bedrooms = []
bathrooms = []
sqft = []
year_built = []
parking = []
pool = []
price = []

# Url part I
url_1 = 'https://www.trulia.com'


def _listing_record(listing_soup):
    """Extract the listing fields from a parsed listing page.

    Parameters
    ----------
    listing_soup : BeautifulSoup
        Parsed HTML of a single listing page.

    Returns
    -------
    dict
        One entry per output column ('Address', 'Bedrooms', 'Bathrooms', 'Area',
        'Year Built', 'Parking', 'Pool', 'Price'). A field whose element is not
        found on the page is returned as ''.
    """
    def text_of(tag):
        # find()/find_next() return None when nothing matches
        return '' if tag is None else tag.get_text()

    def by_testid(tag_name, testid):
        return text_of(listing_soup.find(tag_name, {'data-testid': testid}))

    def by_label(label):
        # the value is read from the first <div> following the label <div>
        heading = listing_soup.find('div', string=label)
        return '' if heading is None else text_of(heading.find_next('div'))

    return {
        'Address': by_testid('span', 'home-details-summary-headline'),
        'Bedrooms': by_testid('li', 'bed'),
        'Bathrooms': by_testid('li', 'bath'),
        'Area': by_testid('li', 'floor'),
        'Year Built': by_label('Year Built'),
        'Parking': by_label('Parking'),
        'Pool': by_label('Pool'),
        'Price': by_testid('h3', 'on-market-price-details'),
    }


# creating a dictionary with the results. It references the lists above, which are
# filled in place by the loop below, and it is defined even if no listing is scraped.
output = {'Address': address, 'Bedrooms':bedrooms, 'Bathrooms': bathrooms, 'Area': sqft, 'Year Built': year_built,
         'Parking': parking, 'Pool': pool, 'Price': price}

# search result pages 1..25
for page_number in range(1, 26):

    # website
    www = 'https://www.trulia.com/CA/San_Diego/' +str(page_number) +'_p/'

    # requests
    response = requests.get(www)

    # soup object
    soup = BeautifulSoup(response.content, 'html.parser')

    # result container
    result_container = soup.find_all('li', {'class':'SearchResultsList__WideCell-b7y9ki-2'})

    # only results with attribute 'data-testid'
    results_update = [r for r in result_container if r.has_attr('data-testid')]

    # relative url of every property card; cards without a link are skipped
    # instead of aborting the whole run
    relative_url = []
    for card in results_update:
        for details in card.find_all('div', {'data-testid':'property-card-details'}):
            anchor = details.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                relative_url.append(href)

    # absolute urls
    url_joined = [urllib.parse.urljoin(url_1, relative_path) for relative_path in relative_url]

    # loop through all joined links
    for listing_url in url_joined:
        listing_response = requests.get(listing_url)

        # create soup object
        listing_soup = BeautifulSoup(listing_response.content, 'html.parser')

        # append one value per column so all lists stay the same length
        record = _listing_record(listing_soup)
        for column, values in output.items():
            values.append(record[column])

In [35]:
df = pd.DataFrame(output)
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Pool,Price
0,1907 Robinson Ave #108,2 Beds,1 Bath,862 sqft (on 0.51 acres),1976,Garage,Pool,"$558,100"
1,5250 Lenox Dr,4 Beds,2 Baths,"1,289 sqft",1973,2 Car Garage,No,"$460,000"
2,7720 Tyrolean Rd,3 Beds,2 Baths,"1,227 sqft",1972,1 Car Garage,Yes,"$798,000"
3,6124 Artisan Way,5 Beds,5 Baths,"3,989 sqft",2018,2 Car Garage,No,"$2,295,000"
4,12614 Darkwood Rd,4 Beds,3 Baths,"2,287 sqft",1987,3 Car Garage,Yes,"$1,299,000"
...,...,...,...,...,...,...,...,...
995,804 Ensenada Ct,5 Beds,3 Baths,"1,900 sqft",1978,3 Open Spaces,No,"$2,399,999"
996,3035 Locust St,5 Beds,3 Baths,"2,640 sqft",1960,2 Car Garage,No,"$1,445,000"
997,18840 Caminito Cantilena #107,2 Beds,2 Baths,"1,056 sqft (on 2.97 acres)",1993,Garage,Pool,"$499,888"
998,4350 Mount Abernathy Ave,5 Beds,3 Baths,"2,080 sqft",1973,2 Car Garage,No,"$1,350,000"


In [36]:
# Add a constant 'Location' column with the value 'San Diego' to every row.
df = df.assign(Location='San Diego')
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Pool,Price,Location
0,1907 Robinson Ave #108,2 Beds,1 Bath,862 sqft (on 0.51 acres),1976,Garage,Pool,"$558,100",San Diego
1,5250 Lenox Dr,4 Beds,2 Baths,"1,289 sqft",1973,2 Car Garage,No,"$460,000",San Diego
2,7720 Tyrolean Rd,3 Beds,2 Baths,"1,227 sqft",1972,1 Car Garage,Yes,"$798,000",San Diego
3,6124 Artisan Way,5 Beds,5 Baths,"3,989 sqft",2018,2 Car Garage,No,"$2,295,000",San Diego
4,12614 Darkwood Rd,4 Beds,3 Baths,"2,287 sqft",1987,3 Car Garage,Yes,"$1,299,000",San Diego
...,...,...,...,...,...,...,...,...,...
995,804 Ensenada Ct,5 Beds,3 Baths,"1,900 sqft",1978,3 Open Spaces,No,"$2,399,999",San Diego
996,3035 Locust St,5 Beds,3 Baths,"2,640 sqft",1960,2 Car Garage,No,"$1,445,000",San Diego
997,18840 Caminito Cantilena #107,2 Beds,2 Baths,"1,056 sqft (on 2.97 acres)",1993,Garage,Pool,"$499,888",San Diego
998,4350 Mount Abernathy Ave,5 Beds,3 Baths,"2,080 sqft",1973,2 Car Garage,No,"$1,350,000",San Diego


In [37]:
df.to_csv('df_san_diego.csv', index=False)