# Getting started #

1. First, install Scrapy:

````
pip install scrapy
````

2. Change the `start_urls` to match the pages you are scraping

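3. Change the `job_title_selector` and `person_bio_link_selector` to select the required items from the page. Both selectors are applied inside each `div.staff` element, so also update that selector in `parse` if your pages use different markup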

4. Run the notebook

5. Write the output to a file and parse the profiles for keywords

In [1]:
import scrapy

# CSS selectors, applied relative to each staff entry found by the spider.
job_title_selector = 'div.name p strong::text'
person_bio_link_selector = 'div.name p a::attr(href)'


class RSESpider(scrapy.Spider):
    """Spider that collects job titles and profile links from staff listings."""

    name = 'staff profile spider'
    start_urls = [
        "http://www.ncl.ac.uk/apl/staff/",
        "http://www.ncl.ac.uk/sacs/staff/",
        "http://www.ncl.ac.uk/nubs/staff/",
    ]

    def parse(self, response):
        """Yield a ``{'title': ..., 'link': ...}`` dict per ``div.staff`` element.

        Each value is the first match of the corresponding module-level
        selector inside the entry, or ``None`` when nothing matches. The
        link is yielded exactly as it appears in the ``href`` attribute.
        """
        staff_entries = response.css('div.staff')
        for entry in staff_entries:
            title = entry.css(job_title_selector).extract_first()
            link = entry.css(person_bio_link_selector).extract_first()
            yield {'title': title, 'link': link}

#         for next_page in response.css('div.prev-post > a'):
#             yield response.follow(next_page, self.parse)

In [2]:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Build the crawler process from whatever Scrapy settings are discoverable
# from the current working directory.
process = CrawlerProcess(get_project_settings())

# Hand over the spider *class*, not an instance: crawl() is documented to take
# a Crawler, a Spider subclass or a spider name, and instantiates the spider
# itself. Newer Scrapy releases reject an instance outright.
process.crawl(RSESpider)

# NOTE: start() runs the Twisted reactor, which cannot be restarted within the
# same Python process; restart the notebook kernel before re-running this cell.
process.start()  # the script will block here until the crawling is finished

2018-03-20 16:01:43 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: scrapybot)
2018-03-20 16:01:43 [scrapy.utils.log] INFO: Versions: lxml 4.2.0.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.9.0, Python 3.6.4 | packaged by conda-forge | (default, Dec 23 2017, 16:31:06) - [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)], pyOpenSSL 17.4.0 (OpenSSL 1.0.2n  7 Dec 2017), cryptography 2.1.4, Platform Linux-4.9.60-linuxkit-aufs-x86_64-with-debian-stretch-sid
2018-03-20 16:01:43 [scrapy.crawler] INFO: Overridden settings: {}
2018-03-20 16:01:43 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2018-03-20 16:01:43 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scra

2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/sacs/staff/>
{'title': 'Clerical Assistant - Culture Lab', 'link': '/sacs/staff/profile/elizabethbradley.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Senior Lecturer in Management', 'link': '/business-school/staff/profile/sueabbott.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Professor of Ageing, Policy & Planning', 'link': '/apl/staff/profile/rcgilroy.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Professor of Cities and Society', 'link': '/apl/staff/profile/stevegraham.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Emeritus Professor of Town & Country Planning', 'link': '/apl/staff/profile/patsyhealey.h

2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Programme Secretary', 'link': '/business-school/staff/profile/ellenarkless.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': None, 'link': '/business-school/staff/profile/sianarmstrong.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Professor of Experimental Architecture', 'link': '/apl/staff/profile/rachelarmstrong3.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Degree Programme Director BA (Hons) Architecture ', 'link': '/apl/staff/profile/samuelaustin.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Teaching Fellow in Architecture', 'link': '/apl/staff/profile/elizabethbaldwingr

2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Reader in Design Computation, Co-Director of ARC (Architectural Research Collaborative) and Director MSc In Experimental Architecture ', 'link': '/apl/staff/profile/martyndade-robertson.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Professor of Environmental Policy & Planning, Director of GURU', 'link': '/apl/staff/profile/simindavoudi.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/sacs/staff/>
{'title': 'Lecturer In Multimedia Journalism', 'link': '/sacs/staff/profile/murraydick.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/sacs/staff/>
{'title': 'School Research Coordinator', 'link': '/sacs/staff/profile/kerrydodds.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/

2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Teaching Fellow', 'link': '/apl/staff/profile/claireharper.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Teaching Fellow in Architecture', 'link': '/apl/staff/profile/lauraharty.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Emeritus Professor of Town & Country Planning', 'link': '/apl/staff/profile/patsyhealey.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Lecturer in Architecture', 'link': '/apl/staff/profile/christoskakalis.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/sacs/staff/>
{'title': 'Lecturer in Fine Art', 'link': '/sacs/staff/profile/nickfox.html'}
2018-03-20 16:01:43 [scrapy.core.scraper] DEBUG: Scraped f

2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Senior Lecturer - DPD MSc Town Planning & MSc Planning for Sustainability and Climate Change', 'link': '/apl/staff/profile/janemidgley.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Senior Lecturer in Architecture', 'link': '/apl/staff/profile/julietodgers.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Lecturer in Architecture', 'link': '/apl/staff/profile/matthewozga-lawn.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Degree Programme Director, MArch', 'link': '/apl/staff/profile/stephenparnell.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Professor of Urban Conservation', 'link': '/apl/staff/pro

2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'International Recruitment Manager', 'link': '/business-school/staff/profile/thomasday.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Lecturer in Planning and Urbanism', 'link': '/apl/staff/profile/georgianavarna.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Professor of Urban Planning and Director of Research', 'link': '/apl/staff/profile/geoffvigar.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Lecturer in Architecture', 'link': '/apl/staff/profile/edwardwainwright.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Lecturer in Town Planning', 'link': '/apl/staff/profile/davidwebb.html'}
2

2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Postdoctoral Fellow', 'link': '/apl/staff/profile/pollygould.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Research Associate in Participatory Design', 'link': '/apl/staff/profile/saraheitlinger.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': None, 'link': '/apl/staff/profile/luishernandez-hernandez.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/sacs/staff/>
{'title': 'Deputy Head of Media, Culture, Heritage', 'link': '/sacs/staff/profile/katymcdonald.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/sacs/staff/>
{'title': 'Associate Dean for Research & Innovation', 'link': '/sacs/staff/profile/simonmckerrell.html'}
2018-03-20 16:01:44 [scrapy.cor

2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Operations Assistant', 'link': '/apl/staff/profile/pamcovell.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Learning & Teaching Assistant (MArch & Admissions)', 'link': '/apl/staff/profile/eileendonnelly.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': None, 'link': '/apl/staff/profile/beckyguthrie.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Computing Officer', 'link': '/apl/staff/profile/markhalpin.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Technician', 'link': '/apl/staff/profile/nathanhudson.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/sacs/s

2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': None, 'link': '/business-school/staff/profile/benjamingolant.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Professor in Marketing', 'link': '/business-school/staff/profile/matthewgorton.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Learning and Teaching Co-Ordinator (AS)', 'link': '/apl/staff/profile/elizabethnoble.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Events and Engagement Manager', 'link': '/apl/staff/profile/alisonpattison.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Receptionist/teaching office support', 'link': '/apl/staff/profile/lorraineproudlock.html'}
20

2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Visiting Professor', 'link': '/apl/staff/profile/gertde-roo.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Visiting Professor', 'link': '/apl/staff/profile/sir-terryfarrell.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Visiting Professor', 'link': '/apl/staff/profile/katjagrillner.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Visiting Professor', 'link': '/apl/staff/profile/annaminton.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Visiting Professor', 'link': '/apl/staff/profile/michaeltawa.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/sacs/staff/>

2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Visiting Fellow', 'link': '/apl/staff/profile/stephenlockley.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Academic Visitor', 'link': '/apl/staff/profile/zhongzhenglyu.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Visiting Fellow', 'link': '/apl/staff/profile/chrispoulton.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/apl/staff/>
{'title': 'Visiting Fellow', 'link': '/apl/staff/profile/grahamtipple.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Emeritus Professor', 'link': '/business-school/staff/profile/michaeljones-lee.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.nc

2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Associate Lecturer', 'link': '/business-school/staff/profile/angelamclean.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'UG Clerical Officer', 'link': '/business-school/staff/profile/michaelmcnally.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': None, 'link': '/business-school/staff/profile/jeanettemears.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Lecturer in Organisational Behaviour and HRM, Degree Programme Director MSc International Business Management ', 'link': '/business-school/staff/profile/elinameliou.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/s

2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Teaching Fellow in Marketing', 'link': '/business-school/staff/profile/anapopovic.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Research Associate', 'link': '/business-school/staff/profile/michaelprice.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Alcan Chair of Management', 'link': '/business-school/staff/profile/stephenprocter.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Postgraduate Programme Secretary', 'link': '/business-school/staff/profile/kaypryer.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'DPD - Combined and Joint Honours P

2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Engagement Support Coordinator ', 'link': '/business-school/staff/profile/emmathomson1.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Teacher', 'link': '/business-school/staff/profile/leslietickner.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Research Associate', 'link': '/business-school/staff/profile/barbaratocco.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': None, 'link': '/business-school/staff/profile/jonathantse-sik-sun.html'}
2018-03-20 16:01:44 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': None, 'link': '/business-school/staff/profile/michtvede.html'}
2018-

2018-03-20 16:01:45 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Lecturer in Enterprise and Innovation', 'link': '/business-school/staff/profile/janetteyoung.html'}
2018-03-20 16:01:45 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Research Associate', 'link': '/business-school/staff/profile/marinayusupova.html'}
2018-03-20 16:01:45 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Teaching Fellow', 'link': '/business-school/staff/profile/sawlatzaman.html'}
2018-03-20 16:01:45 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Lecturer in Accounting and Finance', 'link': '/business-school/staff/profile/yanzeng2.html'}
2018-03-20 16:01:45 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.ncl.ac.uk/business-school/staff/>
{'title': 'Lecturer in Strategic and Operat