## Scraping Multiple Pages - Colorado Springs, CO (Trulia listings)

#### Importing Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import urllib.parse
import pandas as pd

#### Scraping the data

In [2]:
# Column accumulators. Each scraped listing appends exactly one value to
# every list, so the lists stay index-aligned and can become DataFrame columns.
address = []
bedrooms = []
bathrooms = []
sqft = []
year_built = []
parking = []
pool = []
price = []

# Url part I: site root used to resolve the relative listing links
url_1 = 'https://www.trulia.com'

# Seconds to wait for any single HTTP response; without a timeout
# requests.get can block forever on a stalled connection.
REQUEST_TIMEOUT = 30


def _text_or_blank(tag):
    """Return the text of a BeautifulSoup tag, or '' when the lookup found nothing (None)."""
    return tag.get_text() if tag is not None else ''


def _detail_value(page_soup, label):
    """Return the text of the <div> that follows the <div> whose string is `label`, or ''."""
    label_div = page_soup.find('div', string=label)
    if label_div is None:
        return ''
    return _text_or_blank(label_div.find_next('div'))


# NOTE(review): HTTP status codes are not checked; a blocked or failed
# response is parsed like any other page and yields no links / blank fields.
for page_number in range(1, 26):

    # website: search-results page for this page number
    www = 'https://www.trulia.com/CO/Colorado_Springs/' + str(page_number) + '_p/'

    # requests
    response = requests.get(www, timeout=REQUEST_TIMEOUT)

    # soup object
    soup = BeautifulSoup(response.content, 'html.parser')

    # result container
    result_container = soup.find_all('li', {'class': 'SearchResultsList__WideCell-b7y9ki-2'})

    # keep only results that carry the attribute 'data-testid'
    results_update = [r for r in result_container if r.has_attr('data-testid')]

    # absolute urls of every listing linked from this results page
    url_joined = []
    for card in results_update:
        for details in card.find_all('div', {'data-testid': 'property-card-details'}):
            anchor = details.find('a')
            href = anchor.get('href') if anchor is not None else None
            # Skip cards without a usable link: urljoin() with an empty url
            # returns the base, which would scrape the home page as a listing.
            if href:
                url_joined.append(urllib.parse.urljoin(url_1, href))

    # loop through all joined links
    for listing_url in url_joined:
        response = requests.get(listing_url, timeout=REQUEST_TIMEOUT)

        # create soup object for the listing detail page
        listing_soup = BeautifulSoup(response.content, 'html.parser')

        # Missing elements become '' so every list grows by one per listing.
        address.append(_text_or_blank(
            listing_soup.find('span', {'data-testid': 'home-details-summary-headline'})))
        bedrooms.append(_text_or_blank(listing_soup.find('li', {'data-testid': 'bed'})))
        bathrooms.append(_text_or_blank(listing_soup.find('li', {'data-testid': 'bath'})))
        sqft.append(_text_or_blank(listing_soup.find('li', {'data-testid': 'floor'})))
        year_built.append(_detail_value(listing_soup, 'Year Built'))
        parking.append(_detail_value(listing_soup, 'Parking'))
        pool.append(_detail_value(listing_soup, 'Pool'))
        price.append(_text_or_blank(
            listing_soup.find('h3', {'data-testid': 'on-market-price-details'})))

# creating a dictionary with the results; built once, after the loops, so it
# exists even when no listing was found
output = {'Address': address, 'Bedrooms': bedrooms, 'Bathrooms': bathrooms, 'Area': sqft,
          'Year Built': year_built, 'Parking': parking, 'Pool': pool, 'Price': price}

In [3]:
# Turn the scraped column lists into a table: each key of `output` becomes a
# column, and the frame is shown as the cell's result.
df = pd.DataFrame(data=output)
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Pool,Price
0,6214 Meadowbank Ln,3 Beds,2 Baths,"1,471 sqft",2019,2 Car Garage,No,"$368,500"
1,3325 Bridgewater Dr,3 Beds,2 Baths,"1,401 sqft",1984,2 Car Garage,No,"$350,000"
2,7617 Firehawk Ln,3 Beds,3 Baths,"1,280 sqft",2004,2 Car Garage,No,"$335,000"
3,6842 Noble St,3 Beds,2 Baths,"1,253 sqft",1983,1 Car Garage,No,"$335,000"
4,2904 Templeton Gap Rd,4 Beds,2 Baths,"2,094 sqft",1954,1 Car Garage,No,"$385,000"
...,...,...,...,...,...,...,...,...
995,7247 Dutch Loop,5 Beds,4 Baths,"4,146 sqft",2016,4 Car Garage,No,"$625,000"
996,7927 Dutch Loop,4 Beds,3 Baths,"3,341 sqft",2015,Garage,No,"$485,000"
997,5570 Saxton Hollow Rd,5 Beds,6 Baths,"6,735 sqft (on 2.51 acres)",2015,4 Car Garage,No,"$2,250,000"
998,17580 Walden Way,5 Beds,4 Baths,"3,946 sqft (on 5.27 acres)",1999,2 Car Garage,No,"$1,300,000"


In [4]:
# Tag every row with a constant 'Location' value (the scalar is broadcast down
# the whole column), then show the updated frame.
df.loc[:, 'Location'] = 'Colorado'
df

Unnamed: 0,Address,Bedrooms,Bathrooms,Area,Year Built,Parking,Pool,Price,Location
0,6214 Meadowbank Ln,3 Beds,2 Baths,"1,471 sqft",2019,2 Car Garage,No,"$368,500",Colorado
1,3325 Bridgewater Dr,3 Beds,2 Baths,"1,401 sqft",1984,2 Car Garage,No,"$350,000",Colorado
2,7617 Firehawk Ln,3 Beds,3 Baths,"1,280 sqft",2004,2 Car Garage,No,"$335,000",Colorado
3,6842 Noble St,3 Beds,2 Baths,"1,253 sqft",1983,1 Car Garage,No,"$335,000",Colorado
4,2904 Templeton Gap Rd,4 Beds,2 Baths,"2,094 sqft",1954,1 Car Garage,No,"$385,000",Colorado
...,...,...,...,...,...,...,...,...,...
995,7247 Dutch Loop,5 Beds,4 Baths,"4,146 sqft",2016,4 Car Garage,No,"$625,000",Colorado
996,7927 Dutch Loop,4 Beds,3 Baths,"3,341 sqft",2015,Garage,No,"$485,000",Colorado
997,5570 Saxton Hollow Rd,5 Beds,6 Baths,"6,735 sqft (on 2.51 acres)",2015,4 Car Garage,No,"$2,250,000",Colorado
998,17580 Walden Way,5 Beds,4 Baths,"3,946 sqft (on 5.27 acres)",1999,2 Car Garage,No,"$1,300,000",Colorado


In [5]:
# Persist the table to 'df_colorado.csv'; index=False leaves the row index
# out of the file.
df.to_csv(path_or_buf='df_colorado.csv', index=False)