Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…

…pers/rightmove/
pau1mi11er · Aug 18, 2017 · 8ef8c60 · 8ef8c60
commit 8ef8c60
Show file tree

Hide file tree

Showing 2 changed files with 155 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+# Ignore output of scraper
+data.sqlite
diff --git a/scraper.py b/scraper.py
@@ -0,0 +1,153 @@
+# Original scraper: stations found by looking at maps
+# Can include changes etc
+# Up to approximately 2h15
+from lxml import etree
+from lxml.etree import tostring
+from datetime import datetime
+import scraperwiki
+import StringIO
+
+# Parameters for the scraper: stations, radius etc.
+stations = [ {'Nantwich':6473}, {'Chester':2024},
+{'Acton Bridge':68},{'Hartford':4295},
+{'Winsford':10172},{'Macclesfield':5930},
+{'Prestbury':7421},{'Holmes Chapel':4673},
+{'Sandbach':7946},{'Crewe':2423},
+{'Congleton':2288},{'Alsager':185},
+{'Nantwich':6473},{'Wrenbury':10331},
+{'Whitchurch (Salop)':10004},{'Kidsgrove':5114},
+{'Stoke-on-Trent':8771},{'Stone':8777},
+{'Ambergate':209},{'Willington':10112},
+{'Burton-on-Trent':1613},{'Stafford':8660},
+{'Rugeley Trent Valley':7856},{'Codsall':2231},
+{'Albrighton':125},{'Cosford':2357},
+{'Stroud':8858},{'Stonehouse':8795},
+{'Pershore':7184},{'Kingham':5156},
+{'Moreton-in-Marsh':6371}, {'Westbury':9920},
+{'Frome':3641}, {'Taunton':9056},
+{'Tiverton':9218}, {'Hamworthy':4211},
+{'Brockenhurst':1418}, {'Ashurst New Forest':389},
+{'Worcester': 10298}, {'Pershore': 7184},
+{'Hagley': 4109}, {'Evesham': 3323},
+{'Honeybourne': 4697}, {'Shipton': 8207},
+{'Ascott-under-Wychwood Station': 350}, {'Charlbury': 1952},
+{'Finstock': 3521},{'Combe': 2282},
+{'Bradford-on-avon': 1265}, {'Bedwyn': 788},
+{'Pewsey': 7211}, {'Westbury': 9920},
+{'Frome': 3641}, 
+{'Castle Cary': 1853},
+{'Tisbury': 9215}, {'Gillingham': 3755},
+{'Templecombe': 9080}, {'Salisbury': 7922},
+{'Warminster': 9620}, {'Dilton Marsh Rail': 2771},
+{'Avoncliff': 449}, {'Freshford': 3623},
+{'Melksham': 6140}, {'Yate': 10373},
+{'Cam & Dursley': 1691}, {'Kemble': 5009},
+{'Market Harborough': 6050}, {'Kettering': 5087},
+{'Corby': 15013}, {'Wellingborough': 9743},
+{'Banbury': 545}, {'Bicester North': 929},
+{'Princes Risborough': 7454}, {'Bicester Town': 932},
+{'Aylesbury': 458}, {'Amersham': 215},
+{'Long Buckby': 5816}, {'Theale': 9125},
+{'Aldermaston': 131}, {'Midgham': 6209},
+{'Thatcham': 9095}, {'Newbury': 6599},
+{'Kintbury': 5222}, {'Leamington Spa': 5444},
+{'Newbury Racecourse':  6596}, {'Newington': 6623}, 
+{'Hollingbourne': 4661}, {'Chilham': 2048}, 
+{'Wye': 10346}, {'Ham street': 4157}, 
+{'Appledore': 266}, {'Estchingham': 3305},
+{'Chippenham': 2069} ]
+
+MIN_PRICE = 200000
+MAX_PRICE = 310000
+MIN_BEDROOMS = 2
+RADIUS_MILES = 3.0
+stop_phrases = [ "views over the garden",
+"views over the rear garden", "views over the front garden",
+"views over rear garden", "views over front garden",
+"views across the gardens", "views onto the garden",
+"in need of updating", "in need of modernisation",
+"views over rear aspect", "views over front aspect",
+"views over the rear aspect", "views over the front aspect",
+"views over side aspect", "views over the side aspect",
+"1970s",  "bungalow", "bunaglow",
+"views to the front garden", "views to the rear garden" ]
+# "semi detached", "semi-detached", "semidetached",
+DOMAIN = 'http://www.rightmove.co.uk' 
+
+def scrape_individual_house(house_url, town):
+    HOUSE_URL = (DOMAIN + house_url).split('/svr/')[0]
+    #print 'Scraping %s' % HOUSE_URL
+    house_html = scraperwiki.scrape(HOUSE_URL)
+    house_parser = etree.HTMLParser()
+    house_tree = etree.parse(StringIO.StringIO(house_html), house_parser)
+    house_text = house_tree.xpath('string(//div[@class="propertyDetailDescription"])')
+    # Only look at houses with the word 'views' in the ad text. 
+    if 'views' in house_text.lower() or 'elevated position' in house_text.lower():
+        house = {}
+        stopped_phrase = None
+        # Check for stop phrases
+        title = house_tree.xpath('string(//h1[@id="propertytype"])')
+        for sp in stop_phrases:
+            if (sp in house_text.lower()):
+                #print 'Ignoring %s because of stop phrase: %s' % (HOUSE_URL, sp)
+                stopped_phrase = sp
+            if (sp in title.lower()):
+                stopped_phrase = sp
+        #if not any(d.get('link') == HOUSE_URL for d in house_items):
+        image_url = tostring(house_tree.xpath('//img[@id="mainphoto"]')[0])
+        price = house_tree.xpath('string(//div[@id="amount"])')
+        nearby_stations = house_tree.xpath('string(//div[@id="nearbystations"]/div)')
+        ns = nearby_stations.split("(")
+        distance = ns[-1].replace(")","")
+        distance = ' '.join(distance.split()).strip()
+        if float(distance.replace(" miles",""))>1.5:
+            return False
+        map_img = house_tree.xpath('//a[@id="minimapwrapper"]/img')
+        if map_img:
+            map_img = tostring(house_tree.xpath('//a[@id="minimapwrapper"]/img')[0])
+        else:
+            map_img = ''
+        house['title'] = "%s - %s, %s, %s from station" % (title, town, price, distance)
+        #print 'HOUSE FOUND! %s, %s ' % (house['title'], HOUSE_URL)
+        item_text = '<a href="' + HOUSE_URL + '">' + image_url + '</a>'
+        #item_text += '<div style="position:relative;">'
+        item_text += '<a href="' + HOUSE_URL + '">' + map_img + '</a>'
+        #item_text += '<img id="googlemapicon" src="http://www.rightmove.co.uk/ps/images11074/maps/icons/rmpin.png"'
+        #item_text += ' style="position:absolute;top:100px;left:100px;alt="Property location" /></div>'
+        item_text += house_text
+        item_text = item_text.replace("views","<span style='font-weight:bold;color:red;'>views</span>")
+        house['description'] = item_text.replace("fireplace","<span style='font-weight:bold;color:red;'>fireplace</span>")
+        if stopped_phrase:
+            house['stop'] = stopped_phrase
+        else:
+            house['stop'] = ''
+        house['link'] = HOUSE_URL
+        house['pubDate'] = datetime.now()
+        scraperwiki.sqlite.save(['link'], house)
+
+# Gather list of results for an individual station. 
+def scrape_results_page(results_url, town, initial=False):
+    results_url = DOMAIN + results_url
+    html = scraperwiki.scrape(results_url)
+    parser = etree.HTMLParser()
+    tree = etree.parse(StringIO.StringIO(html), parser)
+    house_links = tree.xpath('//ol[@id="summaries"]//a[starts-with(text(), "More details")]/@href')
+    for house_link in house_links:
+        scrape_individual_house(house_link, town)
+    if initial:
+        results_links = tree.xpath('//ul[@class="items"]//a/@href')
+        for r in results_links:
+            scrape_results_page(r, town)
+
+scrape_individual_house('/property-for-sale/property-33036143.html/svr/3113','Castle Cary')
+
+# Go through each station: scrape each set of results in turn. 
+for station in stations:
+    station_name = station.keys()[0].title()
+    print 'Scraping %s' % station_name
+    station_id = station.values()[0]
+    url1 = '/property-for-sale/find.html?locationIdentifier=STATION^%s&minPrice=%s&maxPrice=%s' % (station_id, MIN_PRICE, MAX_PRICE)
+    url2 = '&minBedrooms=%s&displayPropertyType=houses&oldDisplayPropertyType=houses&radius=%s' % (MIN_BEDROOMS, RADIUS_MILES)
+    # displayPropertyType=detachedshouses
+    INITIAL_URL = url1 + url2
+    scrape_results_page(INITIAL_URL, town=station_name, initial=True)