# Scraping Challenge

Do a little scraping or API-calling of your own. Pick a new website and see what you can get out of it. Expect that you'll run into bugs and blind alleys, and rely on your mentor to help you get through.

Formally, your goal is to write a scraper that will:

1. Return specific pieces of information (rather than just downloading a whole page)
2. Iterate over multiple pages/queries
3. Save the data to your computer

Once you have your data, compute some statistical summaries and/or visualizations that give you some new insights into your scraping topic of interest. Write up a report from scraping code to summary and share it with your mentor.

### The Website

What if I used Glassdoor to gather company information while preparing for an interview? The information I want to collect:
1. All interview questions

In [1]:
################## Imports ##############################
# Importing in each cell because of the kernel restarts.
import scrapy
import re
from scrapy.crawler import CrawlerProcess
import numpy as np
import pandas as pd


In [4]:
class GlassdoorSpider(scrapy.Spider):
    """Spider that collects interview questions from a Glassdoor interview page.

    The crawl starts from the Facebook "Data Scientist" interview listing in
    ``start_urls`` and yields one item per review block found on the page.
    """

    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously.
    name = "ESS"

    # URL(s) to start with.
    start_urls = [
        'https://www.glassdoor.com/Interview/Facebook-Interview-Questions-E40772.htm?filter.jobTitleFTS=Data+Scientist',
    ]

    # Use XPath to parse the response we get.
    def parse(self, response):
        """Yield one dictionary per interview review found in ``response``.

        Args:
            response: The downloaded page handed to this callback by Scrapy.

        Yields:
            dict: ``{'interviewquestions': [...]}`` holding the text nodes
            matched inside a single review.
        """
        # Each review on the page is rendered as <li class=" empReview cf " ...>.
        # The previous selector ('//empReview cf') was not a valid XPath
        # expression, so no review could ever be selected. Match on the
        # "empReview" class token instead; the concat/normalize-space idiom
        # avoids false hits on class names that merely contain that substring.
        review_xpath = (
            '//li[contains(concat(" ", normalize-space(@class), " "), " empReview ")]'
        )

        for review in response.xpath(review_xpath):
            # Yield a dictionary with the values we want.
            yield {
                # NOTE(review): this inner path assumes a direct child
                # <section class="interviewQuestions"> holding <p> elements;
                # confirm against the live markup.
                'interviewquestions': review.xpath('section[@class="interviewQuestions"]/p/text()').extract(),
            }
        # Get the URL of the previous page.
#         next_page = response.xpath('//div[@class="nav-previous"]/a/@href').extract_first()
        
#         # There are a LOT of pages here.  For our example, we'll just scrape the first 9.
#         # This finds the page number. The next segment of code prevents us from going beyond page 9.
#         pagenum = int(re.findall(r'\d+',next_page)[0])
        
#         # Recursively call the spider to run on the next page, if it exists.
#         if next_page is not None and pagenum < 10:
#             next_page = response.urljoin(next_page)
#             # Request the next page and recursively parse it the same way we did above
#             yield scrapy.Request(next_page, callback=self.parse)

# Configure the crawler. Besides the output feed, the settings cover
# scraping etiquette (robots.txt, user agent, throttling, caching).
process = CrawlerProcess(settings=dict(
    FEED_FORMAT='json',            # Store data in JSON format.
    FEED_URI='data.json',          # Name our storage file.
    LOG_ENABLED=False,             # Turn off logging for now.
    ROBOTSTXT_OBEY=True,
    USER_AGENT='ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    AUTOTHROTTLE_ENABLED=True,
    HTTPCACHE_ENABLED=True,
))

# Register our spider with the crawler, then start it.
process.crawl(GlassdoorSpider)
process.start()
print('Success!')

Success!


In [1]:
# Quick test -- this kind of works, but I am going to try a different approach.
import scrapy
import re
from scrapy.crawler import CrawlerProcess
class GlassdoorSpider(scrapy.Spider):
    """Quick-test spider that pulls the interview-question markup per review.

    The crawl starts from the Facebook "Data Scientist" interview listing in
    ``start_urls`` and yields one item for every element whose ``id`` starts
    with ``InterviewReview_``.
    """

    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously.
    name = "Glassdoor_Simple"

    # URL(s) to start with.
    start_urls = [
        'https://www.glassdoor.com/Interview/Facebook-Interview-Questions-E40772.htm?filter.jobTitleFTS=Data+Scientist',
    ]

    # Use XPath to parse the response we get.
    def parse(self, response):
        """Yield one dictionary per review element found in ``response``.

        Args:
            response: The downloaded page handed to this callback by Scrapy.

        Yields:
            dict: ``{'interviewquestions': <markup or None>}`` where the value
            is the first element matched inside that particular review.
        """
        print(response)

        # Iterate over every review element on the page.
        for review in response.xpath('//*[starts-with(@id, "InterviewReview_")]'):
            print(review)

            # Yield a dictionary with the values we want.
            yield {
                # The path must be relative to the current review ("./...").
                # An XPath that starts with "//" is evaluated against the whole
                # document even when called on a nested selector, which made
                # every item carry the first review's content.
                'interviewquestions': review.xpath('./div[3]/div/div[2]/div[2]/div/div').extract_first(),
            }

# Configure the crawler by passing in its settings.
process = CrawlerProcess(settings=dict(
    FEED_FORMAT='json',            # Store data in JSON format.
    FEED_URI='firstpage.json',     # Name our storage file.
    LOG_ENABLED=False,             # Turn off logging for now.
    ROBOTSTXT_OBEY=True,
    USER_AGENT='ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    AUTOTHROTTLE_ENABLED=True,
    HTTPCACHE_ENABLED=True,
))

# Register our spider with the crawler, then start it.
process.crawl(GlassdoorSpider)
process.start()
print('Success!')

<200 https://www.glassdoor.com/Interview/Facebook-Interview-Questions-E40772.htm?filter.jobTitleFTS=Data+Scientist>
<Selector xpath='//*[starts-with(@id, "InterviewReview_")]' data='<li class=" empReview cf " id="Interview'>
<Selector xpath='//*[starts-with(@id, "InterviewReview_")]' data='<li class=" empReview cf " id="Interview'>
<Selector xpath='//*[starts-with(@id, "InterviewReview_")]' data='<li class=" empReview cf " id="Interview'>
<Selector xpath='//*[starts-with(@id, "InterviewReview_")]' data='<li class=" empReview cf " id="Interview'>
<Selector xpath='//*[starts-with(@id, "InterviewReview_")]' data='<li class=" empReview cf " id="Interview'>
<Selector xpath='//*[starts-with(@id, "InterviewReview_")]' data='<li class=" empReview cf " id="Interview'>
<Selector xpath='//*[starts-with(@id, "InterviewReview_")]' data='<li class=" empReview cf " id="Interview'>
<Selector xpath='//*[starts-with(@id, "InterviewReview_")]' data='<li class=" empReview cf " id="Interview'>
<Selector xp

In [1]:
# This one works!

import scrapy
import re
from scrapy.crawler import CrawlerProcess
class GlassdoorSpider(scrapy.Spider):
    """Spider that extracts the interview-question text of every review.

    The crawl starts from the Facebook "Data Scientist" interview listing in
    ``start_urls`` and yields one item for every element whose ``id`` starts
    with ``InterviewReview_``.
    """

    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously.
    name = "Glassdoor_Simple"

    # URL(s) to start with.
    start_urls = [
        'https://www.glassdoor.com/Interview/Facebook-Interview-Questions-E40772.htm?filter.jobTitleFTS=Data+Scientist',
    ]

    # Use XPath to parse the response we get.
    def parse(self, response):
        """Yield one dictionary per review element found in ``response``.

        Args:
            response: The downloaded page handed to this callback by Scrapy.

        Yields:
            dict: ``{'interviewquestions': [...]}`` holding the text nodes
            matched inside a single review (an empty list when none match).
        """
        print(response)

        # Get all the review objects for the page once and reuse the result,
        # rather than evaluating the same XPath a second time in the loop.
        reviews = response.xpath('//*[starts-with(@id, "InterviewReview_")]')

        # Extract the information we want from each review.
        for review in reviews:
            # Yield a dictionary with the values we want.
            yield {
                # This is the expression that chooses what we extract; the
                # leading "." keeps the search inside the current review.
                # Swap in other XPath expressions to pull other fields.
                'interviewquestions': review.xpath('.//div[3]/div/div[2]/div[2]/div/div/ul/li/span/text()').extract(),
            }
#//*[@id="InterviewReview_10544127"]/div[3]/div/div[2]/div[2]/div/div/ul/li/span/text()[1]
# Configure the crawler by passing in its settings.
process = CrawlerProcess(settings=dict(
    FEED_FORMAT='json',            # Store data in JSON format.
    FEED_URI='firstpage.json',     # Name our storage file.
    LOG_ENABLED=False,             # Turn off logging for now.
    ROBOTSTXT_OBEY=True,
    USER_AGENT='ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    AUTOTHROTTLE_ENABLED=True,
    HTTPCACHE_ENABLED=True,
))

# Register our spider with the crawler, then start it.
process.crawl(GlassdoorSpider)
process.start()
print('Success!')

<200 https://www.glassdoor.com/Interview/Facebook-Interview-Questions-E40772.htm?filter.jobTitleFTS=Data+Scientist>
Success!
