# Extracting and Analyzing Data Science Job Postings
## STA 141B Final Project

### Questions from proposal:

- Analyze industry data job postings (data analysts, data scientists, data engineers, etc…)
	- Scrape websites for their data science job listings
    - indeed, monster, cybercoders

- Identify the requirements (skills, experiences, education, etc…) 

- Identify salary ranges and benefits.

- How does location and the geographic region influence the components we list above?

- What kinds of software and technologies are workers in this field using?

In [8]:
import pandas as pd
import numpy as np
import requests
import requests_cache
import time
import lxml.html as lx
import itertools
import sqlite3 as sql

In [9]:
requests_cache.install_cache("ds_cache")

In [10]:
states_cap = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
  "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]
states = [state.lower() for state in states_cap]
jobs = ['data+scientist']
states = [state.replace(' ', '+') for state in states]

# part 1, scraping links

In [11]:
def scrape_front_page_indeed(base_url):
    """
    Scrape one page of Indeed by getting links to all job postings on the page, and providing the link to the next page
    Input: Url of the page you wish to scrape
    Output: If a blank string was passed for url, then we exhausted all pages so return None.
            If a regular url was passed, then a tuple with the the list of links as the first element, 
            and the link to the next page as the second element is returned 
    """
    if base_url == '':
        return None
    else:
        response = requests.get(base_url)
        response.raise_for_status()
        html = lx.fromstring(response.text)
        html.make_links_absolute(base_url)

        # get links to each posting
        links = []
        
        sponsered_tags = html.xpath("//td[contains(@id, 'resultsCol')]/div/div/a")
        sponsered_links = [tag.attrib['href'] for tag in sponsered_tags][1:]
        for link in sponsered_links:
            links.append(link)
        
        reg_tags = html.xpath("//td[contains(@id, 'resultsCol')]/div/h2/a")
        reg_links = [link.attrib['href'] for link in reg_tags]
        for link in reg_links:
            links.append(link)

         # get links to the next page
        nav_tags = html.xpath("//td[contains(@id, 'resultsCol')]/div[contains(@class, 'pagination')]/a/span[contains(@class,'pn')]/span[contains(@class,'np')]/../..")
        if len(nav_tags) == 2:
            # previous page, next page --> take next page
            next_url = nav_tags[1].attrib['href']
        elif len(nav_tags) == 0:
            # only one page of search results
            next_url = ''
        else:
            # could be either only previous or only next --> return '' if only previous, return link if next
            nav_texts = nav_tags[0].text_content()
            if 'Previous' in nav_texts:
                next_url = ''
            else:
                next_url = nav_tags[0].attrib['href']
                
        # make sure we only have links to job postings and nothing else:
        links_copy = links.copy()

        for link in links_copy:
            if 'forum' in link:
                links.remove(link)
            elif 'salaries' in link:
                links.remove(link)

        time.sleep(0.01)
        
        
        return list(set(links)), next_url

In [16]:
def get_all_links(first_page_url):
    """
    Get all the links from one search specification.
    Input: url of the first page returned from the search
    Output: list of all the links to job postings from that search
    """
    # start with scraping the first page
    links, next_url = scrape_front_page_indeed(first_page_url)
    
    # initialize posting list
    postings = [link for link in links]
    
    # recall function until you 
    count = 0
    while count < 11:
        new_links, next_url = scrape_front_page_indeed(next_url)
        for link in new_links:
            postings.append(link)
        count += 1
    
    return list(set(postings))

In [19]:
base_url = 'https://www.indeed.com/jobs?q=data+scientist&l=california'

In [17]:
my_dict = {}
for comb in itertools.product(jobs, states):
    my_dict[comb] = get_all_links('https://www.indeed.com/jobs?q={}&l={}'.format(comb[0], comb[1]))

my_dict

TypeError: cannot unpack non-iterable NoneType object

# part 2, scraping posts

In [None]:
def scrape_post(url):
    """
    Scrape one job post from Indeed.
    Input: url of the page to scrape
    Output: dictionary with title of job, company, location, text of the post, and the footer
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        html = lx.fromstring(response.text)
        html.make_links_absolute
    except:
        return {'url': url, 'title':'', 'info':'', 'text':'', 'footer':''} 
        
    results = {}
    
    results['url'] = url
    
    try:
        title = html.xpath("//div[contains(@class, 'jobsearch-JobComponent')]//h3[contains(@class, 'jobsearch-JobInfoHeader-title')]")[0].text_content()
        results['job_title'] = title
    except:
        results['job_title'] = ''
    
    try:
        info = [tag.text_content() for tag in html.xpath("//div[contains(@class,'jobsearch-DesktopStickyContainer')]/div") if tag.text_content() != '']
        results['info'] = ' '.join(info)
    except:
        results['info'] = ''
    
    try:
        text = html.xpath("//div[contains(@class, 'jobsearch-JobComponent-description')]")[0].text_content()
        results['text'] = text
    except:
        results['text'] = ''
        
    try:
        footer = html.xpath("//div[contains(@class, 'jobsearch-JobMetadataFooter')]")[0].text_content()
        results['footer'] = footer
    except:
        results['footer'] = ''

    return results

# part 3, combine posts into dataframe

In [None]:
def make_df(key, links):
    posts = [scrape_post(link) for link in links]
    df = pd.concat([pd.DataFrame(posts)])
    df['jobsearchterm'] = key[0]
    df['state'] = key[1]
    return df

In [None]:
dfs = []
for key, value in my_dict.items():
    add_df = make_df(key, value)
    dfs.append(add_df)
    
dfs

In [None]:
dfs