In [1]:
from bs4 import BeautifulSoup # BeautifulSoup is in bs4 package 
import requests
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
%matplotlib inline

In [2]:
def text_cleaner(website):
    '''
    This function just cleans up the raw html so that I can look at it.
    Inputs: a URL to investigate
    Outputs: Cleaned text only
    '''
    try:
        content = requests.get(website) 
    except: 
        return   # Need this in case the website isn't there anymore or some other weird connection problem 
    
    soup_obj = BeautifulSoup(content.text, 'html.parser') # Get the html from the site
    
    for script in soup_obj(["script", "style"]):
        script.extract() # Remove these two elements from the BS4 object
    

    text = soup_obj.get_text() # Get the text from this
    
        
    
    lines = (line.strip() for line in text.splitlines()) # break into lines
    
        
        
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
    
    def chunk_space(chunk):
        chunk_out = chunk + ' ' # Need to fix spacing issue
        return chunk_out  
        
    
    text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8') # Get rid of all blank lines and ends of line
        
        
    # Now clean out all of the unicode junk (this line works great!!!)
        
    try:
        text = text.decode('unicode_escape') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        return                                                         # an exception
       
      
    text = re.sub("[^a-zA-Z.+3#]"," ", text)  # Now get rid of any terms that aren't words (include 3 for d3.js)
                                                # Also include + for C++
        
       
    text = text.lower().split()  # Go to lower case and split them apart   
        
        
    text = list(set(text)) # Last, just get the set of these. Ignore counts (we are just looking at whether a term existed
                            # or not on the website)
        
    return text

In [3]:
result = text_cleaner('http://www.indeed.com/viewjob?jk=5505e59f8e5a32a4&q=%22data+scientist%22&tk=19ftfgsmj19ti0l3&from=web&advn=1855944161169178&sjdu=QwrRXKrqZ3CNX5W-O9jEvWC1RT2wMYkGnZrqGdrncbKqQ7uwTLXzT1_ME9WQ4M-7om7mrHAlvyJT8cA_14IV5w&pub=pub-indeed')

In [4]:
result

['be',
 'you',
 'requested',
 'found',
 'found.',
 'page',
 'again',
 'mobile',
 'not',
 'try',
 'could',
 'home',
 'the',
 'indeed']

In [5]:
def skills_info(city = None, state = None):
    '''
    This function will take a desired city/state and look for all new job postings
    on Indeed.com. It will crawl all of the job postings and keep track of how many
    use a preset list of typical data science skills. The final percentage for each skill
    is then displayed at the end of the collation. 
        
    Inputs: The location's city and state. These are optional. If no city/state is input, 
    the function will assume a national search (this can take a while!!!).
    Input the city/state as strings, such as skills_info('Chicago', 'IL').
    Use a two letter abbreviation for the state.
    
    Output: A bar chart showing the most commonly desired skills in the job market for 
    a data scientist. 
    '''
        
    final_job = 'data scientist' # searching for data scientist exact fit("data scientist" on Indeed search)
    
    # Make sure the city specified works properly if it has more than one word (such as San Francisco)
    if city is not None:
        final_city = city.split() 
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['http://www.indeed.com/jobs?q=', final_job, '&l=', final_city,
                    '%2C+', state] # Join all of our strings together so that indeed will search correctly
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']

    final_site = ''.join(final_site_list) # Merge the html address together into one string

    
    base_url = 'http://www.indeed.com'
    
    
    try:
        html = requests.get(final_site) # Open up the front page of our search first
    except:
        'That city/state combination did not have any jobs. Exiting . . .' # In case the city is invalid
        return
    soup = BeautifulSoup(html.text, 'html.parser') # Get the html from the first page
    
    # Now find out how many jobs there were
    
    num_jobs_area = soup.find(id = 'searchCount').encode('utf-8').decode('unicode_escape') # Now extract the total number of jobs found
                                                                        # The 'searchCount' object has this
    total_num_jobs = re.findall('of\s(\d\d,?\d+)', num_jobs_area)[0] # Extract the total jobs found from the search result
  
    print(total_num_jobs)
    

In [6]:
skills_info(city = 'New York', state = 'NY')

387


In [184]:
def skills_info(city = None, state = None):
    '''
    This function will take a desired city/state and look for all new job postings
    on Indeed.com. It will crawl all of the job postings and keep track of how many
    use a preset list of typical data science skills. The final percentage for each skill
    is then displayed at the end of the collation. 
        
    Inputs: The location's city and state. These are optional. If no city/state is input, 
    the function will assume a national search (this can take a while!!!).
    Input the city/state as strings, such as skills_info('Chicago', 'IL').
    Use a two letter abbreviation for the state.
    
    Output: A bar chart showing the most commonly desired skills in the job market for 
    a data scientist. 
    '''
        
    final_job = 'data scientist' # searching for data scientist exact fit("data scientist" on Indeed search)
    
    # Make sure the city specified works properly if it has more than one word (such as San Francisco)
    if city is not None:
        final_city = city.split() 
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['http://www.indeed.com/jobs?q=', final_job, '&l=', final_city,
                    '%2C+', state] # Join all of our strings together so that indeed will search correctly
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']

    final_site = ''.join(final_site_list) # Merge the html address together into one string

    
    base_url = 'http://www.indeed.com'
    
    
    try:
        html = requests.get(final_site) # Open up the front page of our search first
    except:
        print('That city/state combination did not have any jobs. Exiting . . .') # In case the city is invalid
        return
    soup = BeautifulSoup(html.text, 'html.parser') # Get the html from the first page
    
    # Now find out how many jobs there were
    
    num_jobs_area = soup.find(id = 'searchCount').encode('utf-8').decode('unicode_escape') # Now extract the total number of jobs found
                                                                        # The 'searchCount' object has this
    total_num_jobs = re.findall('of\s(\d\d,?\d+)', num_jobs_area)[0] # Extract the total jobs found from the search result
  
    print(total_num_jobs)
    
    city_title = city
    if city is None:
        city_title = 'Nationwide'
        
    print('There were', total_num_jobs, 'jobs found in', city_title + ',' + state) # Display how many jobs were found
    
    num_pages = int(int(total_num_jobs)/10) # This will be how we know the number of times we need to iterate over each new
                                      # search result page
    job_descriptions = [] # Store all our descriptions in this list
    
    for i in range(1,num_pages+1): # Loop through all of our search result pages
        print('Getting page', i)
        start_num = str(i*10) # Assign the multiplier of 10 to view the pages we want
        current_page = ''.join([final_site, '&start=', start_num])
        # Now that we can view the correct 10 job returns, start collecting the text samples from each
            
        html_page = requests.get(current_page) # Get the page
        soup = BeautifulSoup(html_page.text, 'html.parser')
        
        page_obj = soup # Locate all of the job links
        job_link_area = page_obj.find(id = "resultsCol") # The center column on the page where the job postings exist
        
        job_links = job_link_area.find_all('a')
        
        for link in job_links:
            job_hyperlinks = link.get('href')
            
        
        job_related_URLS = []
        for hyperlink in job_hyperlinks:
            if 'clk' in str(hyperlink) == True:
                job_related_URLS.append(hyperlink)
            else:
                pass
        print(job_related_URLS)
        
       
        
     
        
        

In [185]:
skills_info(city = 'New York', state = 'NY')

475
There were 475 jobs found in New York,NY
Getting page 1
[]
Getting page 2
[]
Getting page 3
[]
Getting page 4
[]
Getting page 5
[]
Getting page 6
[]
Getting page 7
[]
Getting page 8
[]
Getting page 9
[]
Getting page 10
[]
Getting page 11
[]
Getting page 12
[]
Getting page 13
[]
Getting page 14
[]
Getting page 15
[]
Getting page 16
[]
Getting page 17
[]
Getting page 18
[]
Getting page 19
[]
Getting page 20
[]
Getting page 21
[]
Getting page 22
[]
Getting page 23
[]
Getting page 24
[]
Getting page 25
[]
Getting page 26
[]
Getting page 27
[]
Getting page 28
[]
Getting page 29
[]
Getting page 30
[]
Getting page 31
[]
Getting page 32
[]
Getting page 33
[]
Getting page 34
[]
Getting page 35
[]
Getting page 36
[]
Getting page 37
[]
Getting page 38
[]
Getting page 39
[]
Getting page 40
[]
Getting page 41
[]
Getting page 42
[]
Getting page 43
[]
Getting page 44
[]
Getting page 45
[]
Getting page 46
[]
Getting page 47
[]
