# Daft Scraping

This is a Python script that scrapes Daft.ie for all house prices listed since the time of the last scrape. Each listing is uniquely identified by its URL, which contains a reference to the apartment/house.

In [1]:
from bs4 import BeautifulSoup, Comment
from urllib2 import urlopen
import pprint as pp
import html5lib 
import sys
#import mysql.connector as sql

In [2]:
# Search-results page for room shares. The scrape loop fetches this URL and
# prefixes it to the href of every listing it finds.
BASE_URL = 'http://www.daft.ie/ireland/rooms-to-share'
#BASE_URL_2 = 'http://www.daft.ie'

### Scrape Content

This section of code loops through the apartment-share search results and extracts the HTML for each apartment. Once extracted, features are constructed and the result is stored in a MySQL database for analysis. (In the cell below, moving on to the next results page and the database insert are still marked as to-do.)

In [22]:
# Fetch the search-results page; every <div class="box"> on it is treated as
# one ad.
soup = make_soup(BASE_URL)

for idx, ad_section in enumerate(soup.find_all('div', 'box')):

    try:
        # The first anchor inside the box carries the link to the ad's own page.
        link = str(BASE_URL) + str(ad_section.find('a').get('href'))

        # Single-argument print: same output as `print idx, link`, and it
        # also parses under Python 3.
        print('%s %s' % (idx, link))
        ad = make_soup(link)

        # Now find the features of the individual property
        features = get_property_features(ad)
        print(features)

        # TODO: check if already in DB, then insert the apartment information
        # into the database (one row per apartment):
        #insert_features( prop_features )

        # TODO: offset the page number by 10 and create a new BASE_URL

    except Exception as err:
        # Best-effort scrape: one malformed or unreachable ad must not stop
        # the run, but the failure is reported instead of silently dropped.
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit
        # through.
        sys.stderr.write('Skipping ad %s: %r\n' % (idx, err))

0 http://www.daft.ie/ireland/rooms-to-share/sligo/house-share/sligo/76-cartron-bay-sligo-sligo-830596/
[{'views': 1231, 'price': '220', 'address': '76 Cartron Bay, Sligo, Co. Sligo', 'couples': None, 'facilities': 2, 'total_residents': None, 'picture_count': 9, 'property_type': 'House', 'availability': u' Immediately', 'date_entered': u'16/8/2015 (today, 50 minutes ago)', 'lease_len': u' 9 months or more', 'average_cost': 178.75, 'owner_occ': None, 'bedroom_type': None}]
1 http://www.daft.ie/ireland/rooms-to-share/dublin/house-share/ranelagh/ranelagh-ave-ranelagh-dublin-837918/
[{'views': 75, 'price': '600', 'address': 'Ranelagh ave, Ranelagh, Dublin 6', 'couples': 0, 'facilities': 5, 'total_residents': None, 'picture_count': 1, 'property_type': 'House', 'availability': u' Tuesday 1st September', 'date_entered': u'16/8/2015 (today, 1 hour 56 minutes ago)', 'lease_len': u' 9 months or more', 'average_cost': 616.6666666666666, 'owner_occ': 1, 'bedroom_type': 'Twin'}]
2 http://www.daft.ie

This method takes a parsed HTML page as a parameter and extracts the relevant features of the property ad. It returns the features as a one-element list containing a dictionary; the caller can in turn insert that record into a MySQL database.

In [21]:
def get_property_features(HTML):
    """Collect the features of one property ad from its parsed page.

    HTML is the parsed ad page (the object returned by make_soup).

    Returns a one-element list holding a dict with the keys 'address',
    'price', 'property_type', 'picture_count', 'bedroom_type', 'couples',
    'total_residents', 'owner_occ', 'availability', 'lease_len',
    'facilities', 'date_entered', 'views' and 'average_cost'.
    """
    # Header block: address, price and property type
    header = HTML.find('div', 'smi-object-info')

    address = str(header.find('h1').get_text())

    price_text = header.find('div', {'id': 'smi-price-string'}).get_text()
    price = str(price_of_property(price_text))

    # The property type is the first word of the header text
    header_text = header.find('span', 'header_text').get_text()
    property_type = str(header_text.split(' ')[0])

    body = HTML.find('div', {'id': 'smi-content'})

    # Number of photos attached to the ad
    picture_count = get_picture_count(body)

    # Bullet list in the overview area describing the rental
    overview_rows = body.find('div', {'id': 'overview'}).find_all('li')
    bedroom, total_residents, owner_occ, couples = get_basic_info(
        overview_rows, len(overview_rows))

    # The availability text directly follows the first 'left_title' heading
    first_heading = body.find('h3', 'left_title')
    availability = first_heading.next_sibling

    # The lease length follows the 'left_title' heading found inside the
    # first sibling element after that heading
    later_siblings = first_heading.find_next_siblings()
    length_lease = later_siblings[0].find('h3', 'left_title').next_sibling

    # Number of facilities listed
    fac_count = get_facility_count(body.find('table', {'id': 'facilities'}))

    # Date the rental was entered/renewed, and the view counter
    date = get_date_entered(body)
    views = get_property_views(body)

    # Average cost of the other rentals shown for the same location
    average_cost = get_avg_cost_vicinity(HTML)

    features = {
        'address': address,
        'price': price,
        'property_type': property_type,
        'picture_count': picture_count,
        'bedroom_type': bedroom,
        'couples': couples,
        'total_residents': total_residents,
        'owner_occ': owner_occ,
        'availability': availability,
        'lease_len': length_lease,
        'facilities': fac_count,
        'date_entered': date,
        'views': views,
        'average_cost': average_cost,
    }
    return [features]
    

# For testing purposes: run the extractor against one hard-coded ad URL.
# NOTE(review): this fetches the page over HTTP every time the cell runs, and
# the ad may no longer exist - confirm before relying on it.
get_property_features(make_soup('http://www.daft.ie/ireland/rooms-to-share/sligo/house-share/sligo/76-cartron-bay-sligo-sligo-830596/'))

[{'address': '76 Cartron Bay, Sligo, Co. Sligo',
  'availability': u' Immediately',
  'average_cost': 63.125,
  'bedroom_type': None,
  'couples': None,
  'date_entered': u'16/8/2015 (today, 49 minutes ago)',
  'facilities': 2,
  'lease_len': u' 9 months or more',
  'owner_occ': None,
  'picture_count': 9,
  'price': '220',
  'property_type': 'House',
  'total_residents': None,
  'views': 1231}]

Utility functions:

In [16]:
def make_soup(url):
    """Download *url* and return its content parsed by BeautifulSoup.

    The page is parsed with the 'html.parser' backend. Any error raised by
    urlopen or while reading the response propagates to the caller.
    """
    response = urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails; the urllib2
        # response object is not a context manager, hence try/finally.
        response.close()
    return BeautifulSoup(html, 'html.parser')

def get_basic_info(content, rows):
    """Read bedroom type, resident count, owner occupancy and couples policy.

    content is the sequence of overview rows (objects exposing get_text());
    rows is the number of rows the caller found, which selects the layout:

    * 4 rows: bedroom, current residents, owner occupied, couples
    * 3 rows: bedroom, owner occupied, couples (resident count unknown)

    Returns the tuple (bedroom, total_residents, owner_occ, couples). Any
    other row count yields four Nones. A field whose row is missing from
    content is returned as None instead of raising UnboundLocalError.
    """
    if rows == 4:
        layout = ('bedroom', 'total_residents', 'owner_occ', 'couples')
    elif rows == 3:
        layout = ('bedroom', 'owner_occ', 'couples')
    else:
        return None, None, None, None

    info = dict.fromkeys(
        ('bedroom', 'total_residents', 'owner_occ', 'couples'))

    # zip stops at the shorter sequence, so surplus rows are ignored and
    # missing rows leave the corresponding field at None.
    for field, row in zip(layout, content):
        text = row.get_text()
        if field == 'bedroom':
            info[field] = bedroom_type(text)
        elif field == 'total_residents':
            # Total residents - including the spare room being advertised
            info[field] = return_digit_from_string(text) + 1
        elif field == 'owner_occ':
            info[field] = 0 if 'not' in text else 1
        else:
            # Whether couples are allowed
            info[field] = 0 if 'Couples not accepted' in text else 1

    return (info['bedroom'], info['total_residents'],
            info['owner_occ'], info['couples'])

# Method to determine the room type available
def bedroom_type(bedroom):
    """Return the room type named in *bedroom*.

    The text is checked for 'Single', then 'Double', then 'Twin'
    (case-sensitive); the first match wins. Returns None when none of them
    occurs.
    """
    for room_type in ('Single', 'Double', 'Twin'):
        if room_type in bedroom:
            return room_type
    return None
    
    
# Method to return the price of a property(Small assumption made :S) - Improve this to determine proper period
def price_of_property(string):
    """Return the price found in *string* as an int.

    When the text contains 'weekly' the amount is multiplied by 4 (the
    small assumption mentioned above); otherwise it is returned unchanged.
    """
    multiplier = 4 if 'weekly' in string else 1
    return return_digit_from_string(string) * multiplier

    
# Method to return the digits of the string as a single integer
def return_digit_from_string(string):
    """Return all digits of *string*, concatenated, as an int.

    Every digit character is kept in order and everything else is dropped,
    so '1,200' gives 1200 and '2 beds, 3 baths' gives 23. Raises ValueError
    when the string contains no digit.
    """
    kept = []
    for character in string:
        if character.isdigit():
            kept.append(character)
    return int(''.join(kept))


def get_date_entered(content):
    """Return the node holding the date the ad was entered/renewed.

    The value is taken from the fourth sibling after the table with id
    'facilities' inside *content*, and is returned as found.
    """
    node = content.find('table', {'id': 'facilities'})
    for _ in range(4):
        node = node.next_sibling
    return node


def get_property_views(content):
    """Return the ad's view counter as an int.

    The counter is read from the sixth sibling after the table with id
    'facilities' inside *content*; thousands separators (commas) are
    removed before conversion.
    """
    node = content.find('table', {'id': 'facilities'})
    for _ in range(6):
        node = node.next_sibling
    without_commas = node.replace(",", "")
    return int(without_commas)
    
    
def get_facility_count(content):
    """Return the number of facilities listed in the facilities table.

    content is the facilities table element; every <li> found inside each
    of its <td> cells counts as one facility. Returns 0 when content is
    None, i.e. the lookup for the table found nothing.
    """
    if content is None:
        return 0
    return sum(len(cell.find_all('li')) for cell in content.find_all('td'))


# This method calculates the average cost of the apartments within the same vicinity
def get_avg_cost_vicinity(HTML):
    """Return the mean price of the other properties listed on the ad page.

    Prices are read from the <strong> element of every <li> inside the div
    with id 'smi-tab-other-properties'; only entries whose text contains
    'Monthly' or 'Weekly' are counted.

    Returns a float, or None when the tab is missing or holds no usable
    price (previously this raised AttributeError / ZeroDivisionError).

    NOTE(review): weekly and monthly amounts are averaged together without
    being converted to a common period - confirm whether that is intended.
    """
    frame = HTML.find('div', {'id': 'smi-tab-other-properties'})
    if frame is None:
        return None

    prices = []
    for pane in frame.find_all('li'):
        label = pane.find('strong')
        if label is None:
            # Entry without a price label: nothing to average.
            continue
        cost_string = label.get_text().strip()
        if 'Monthly' in cost_string or 'Weekly' in cost_string:
            prices.append(return_digit_from_string(cost_string))

    if not prices:
        return None
    return sum(prices) / float(len(prices))

def get_picture_count(content):
    """Return the number of photos of the property, or 0 if it is unknown.

    The count is parsed from the first two characters of the text of the
    <a class="p1"> element inside *content*. A missing element or a
    non-numeric text gives 0.
    """
    try:
        return int(content.find('a', 'p1').get_text()[:2])
    except (AttributeError, TypeError, ValueError):
        # AttributeError: content or the link is None;
        # TypeError/ValueError: the text is not a usable number.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        return 0

### SQL Inserts

This method is used to insert data into a MySQL database for persistence. In addition to that, we store the raw HTML in a NoSQL database for additional flexibility.

In [12]:
def insert_features(property_details):
    """Insert one property row into the `property` table of the daft database.

    property_details is the parameter sequence for the INSERT below; the
    statement has ten placeholders (address, price, property_type,
    picture_count, bedroom_type, couples, total_residents, owner_occupied,
    link, html). INSERT IGNORE is used, so database errors are printed and
    otherwise ignored (best effort).

    NOTE(review): this relies on the name `sql` being in scope (the
    `mysql.connector` import at the top of the file is commented out) -
    confirm before enabling the call.
    """
    db_params = {'user' : "root",
                 'password' : "",
                 'host' : "localhost",
                 'port' : 3306,
                 'database' : "daft",
                 'charset' : 'utf8',
                 'collation' : 'utf8_general_ci',
                 'buffered' : True
                }

    query = """
            INSERT IGNORE 
            INTO property (address, price, property_type, 
            picture_count, bedroom_type, couples, total_residents, owner_occupied, link, html) 
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """

    conn = sql.connect(**db_params)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(query, property_details)
            conn.commit()
        except sql.Error as err:
            # Best effort: report the database error and carry on.
            print(err)
        finally:
            cursor.close()
    finally:
        # Previously the connection was never closed, leaking one
        # connection per inserted row.
        conn.close()

In [9]:
def css_styling():
    """Load ./styling/custom.css and return it wrapped in an IPython HTML object."""
    from IPython.display import HTML
    # The with-block closes the file handle, which was previously left open.
    with open('./styling/custom.css', 'r') as stylesheet:
        styles = stylesheet.read()
    return HTML(styles)

# Load the custom stylesheet; presumably the notebook renders the returned
# HTML object as this cell's output - TODO confirm.
css_styling()