# Scraping Skills and Their Job Categories From Guru

This notebook does not collect any user information. It only extracts the skill / sub-category / category labels, which I use to reduce the dimensionality of the 2,000+ skills to about 9 general categories.

In [1]:
# Scraping Libraries
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
import time
from bs4 import BeautifulSoup

# Data Management
import pandas as pd
import numpy as np

# SQL Import
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2


## Defining Functions

1. Extract the locations of the categories
2. Extract the locations of the sub-categories (opens after clicking a category)
3. Extract the locations of the skills (opens after clicking a sub-category)
4. Create dataframe from text (broadcasting fills in for category and sub-category).
5. Append all the dataframes (each one is for just one cat-sub-cat combo)
6. Save to database

In [2]:
def extract_category_paths():
    """
    This extracts all the first-level (category) links to get deeper into the skill tree.

    Uses the module-level ``driver``, which must already have the page loaded.

    Returns a list of the locations of all the category buttons, without the
    leading "Any Category" entry. The list is empty if the XPath matched nothing.
    """
    # find_elements(By.XPATH, ...) is the supported locator call; the older
    # find_elements_by_xpath helper no longer exists in current Selenium releases.
    categories = driver.find_elements(
        By.XPATH,
        '//*[@id="search-app"]/section/div/div[1]/div[2]/div[1]/div/div/div/div[2]/div/div[1]/div/div[2]/div/ul/li/label')

    # Slicing removes the "Any Category" element and, unlike `del categories[0]`,
    # does not raise IndexError when no elements were found.
    return categories[1:]

In [3]:
def extract_subcategory_paths():
    """
    For a given category it will extract all the sub-category links (level 2)

    Uses the module-level ``driver``; the category must already have been clicked.

    Returns a list of sub-category elements, or ["NaN"] when the category
    has no sub-categories.
    """
    # find_elements(By.XPATH, ...) is the supported locator call; the older
    # find_elements_by_xpath helper no longer exists in current Selenium releases.
    subcategories = driver.find_elements(
        By.XPATH,
        '//*[@id="search-app"]/section/div/div[1]/div[2]/div[1]/div/div/div/div[2]/div/div[2]/div/div[2]/div/ul/li/label')

    # A single match is only the "Any Category" item, i.e. no real sub-categories.
    # Zero matches is treated the same way instead of failing on `del ...[0]`.
    if len(subcategories) <= 1:
        return ["NaN"]

    # Otherwise drop the leading "Any Category" item.
    return subcategories[1:]

In [4]:
def extract_skills():
    """
    For a given category - subcategory grouping extract all the skills

    Uses the module-level ``driver``; the sub-category must already have been clicked.

    Returns a list of skill elements, or ["NaN"] if there are no skills.
    """
    # find_elements(By.XPATH, ...) is the supported locator call; the older
    # find_elements_by_xpath helper no longer exists in current Selenium releases.
    skills = driver.find_elements(
        By.XPATH,
        '//*[@id="search-app"]/section/div/div[1]/div[2]/div[1]/div/div/div/div[2]/div/div[3]/div/div[2]/div/ul/li')

    # If there are no skills, return the NaN placeholder
    if len(skills) == 0:
        return ["NaN"]

    return skills

In [5]:
def _element_text(item):
    """Returns an element's innerText, or the item unchanged if it is a plain placeholder."""
    if hasattr(item, 'get_attribute'):
        return item.get_attribute('innerText')
    return item


def make_skills_dataframe(c,sc,sk):
    """
    Creates a dataframe for each cat-subcat-skills list combo.
    These are later appended to form a single dataframe.

    Inputs:
        - c: A category selenium element
        - sc: A sub-category selenium element, or the "NaN" placeholder
          (either as a plain string or wrapped in a one-element list)
        - sk: List where each element is a skill element (or the "NaN"
          placeholder) associated with the given category - sub-category.

    Returns:
        - data_frame: dataframe with columns Category, Subcategory and Skills.
          Nrow = # skills; category and sub-category are broadcast to every row.

    Raises:
        - Whatever ``get_attribute`` raises for a real element. Only placeholder
          values (objects without ``get_attribute``) are passed through as-is,
          so a failing element is no longer silently stored in the dataframe.
    """

    # Extract text from the category element
    category = c.get_attribute('innerText')

    # A placeholder wrapped in a one-element list must be unwrapped: pandas only
    # broadcasts scalars, and a length-1 list next to several skills raises ValueError.
    if isinstance(sc, (list, tuple)) and len(sc) == 1:
        sc = sc[0]
    sub_category = _element_text(sc)

    # A bare string would otherwise be iterated character by character.
    if isinstance(sk, str):
        sk = [sk]
    skills = [_element_text(item) for item in sk]

    data = {'Category': category, 'Subcategory': sub_category, 'Skills': skills}
    data_frame = pd.DataFrame(data)
    return data_frame

In [6]:
def add_table_to_db(dataframe,table_name):
    """
    Writes a dataframe to a table in the PostgreSQL database on localhost,
    replacing the table if it already exists.

    Connection settings are read from environment variables so that no
    password is stored in (or printed by) the notebook:
        - FREELANCE_DB_NAME: database name (default 'freelance_db')
        - FREELANCE_DB_USER: user name (default 'Metaverse')
        - FREELANCE_DB_PASSWORD: password (required)

    Input:
        - dataframe: The data I want to put into the SQL database
        - table_name: Assign a name to the table in the database

    Returns:
        - None. Prints the (password-masked) connection URL and a confirmation.

    Raises:
        - RuntimeError: if FREELANCE_DB_PASSWORD is not set.
    """
    import os
    from urllib.parse import quote_plus

    dbname = os.environ.get('FREELANCE_DB_NAME', 'freelance_db')
    username = os.environ.get('FREELANCE_DB_USER', 'Metaverse')
    pswd = os.environ.get('FREELANCE_DB_PASSWORD')
    if not pswd:
        raise RuntimeError(
            "Set the FREELANCE_DB_PASSWORD environment variable before calling add_table_to_db")

    # Connect to the database. User and password are URL-escaped so special
    # characters such as '@' or '/' cannot corrupt the connection string.
    engine = create_engine(
        'postgresql://%s:%s@localhost/%s'%(quote_plus(username),quote_plus(pswd),dbname))
    # Never echo the real password into the notebook output.
    print('postgresql://%s:***@localhost/%s'%(username,dbname))

    try:
        ## insert data into database from Python (proof of concept - this won't be useful for big data, of course)
        dataframe.to_sql(table_name, engine, if_exists='replace')
    finally:
        # Release pooled connections even if the write fails.
        engine.dispose()

    print("Added data to %s"%(dbname))

## Executing: Scraping and Saving to Database

In [7]:
# Accessing page
driver = webdriver.Firefox()

# Pause after each click so the next menu level has time to render.
CLICK_PAUSE_SECONDS = 0.5

# One dataframe per category / sub-category combination. They are concatenated
# once after the loop: DataFrame.append no longer exists in pandas 2.x and
# re-copied the whole frame on every iteration.
frames = []

try:
    driver.get("https://www.guru.com/d/jobs/")
    # find_element(By.XPATH, ...) is the supported locator call; the older
    # find_element_by_xpath helper no longer exists in current Selenium releases.
    driver.find_element(
        By.XPATH,
        '/html/body/form/main/main/section/div/div[1]/div[2]/div[1]/div/button').click()

    # Extracting the category elements
    categories = extract_category_paths()

    # Looping over each category element to extract the sub-categories
    # Then looping through the sub-categories to extract the skills
    for val in categories:

        val.click()
        time.sleep(CLICK_PAUSE_SECONDS)

        # Extracting sub-categories
        sub_categories = extract_subcategory_paths()

        for sub_val in sub_categories:

            # First, clicking a given sub-category to open skills menu.
            # The "NaN" placeholder (no sub-categories) is a plain string and
            # has nothing to click, so it is passed through unchanged.
            if not isinstance(sub_val, str):
                try:
                    sub_val.click()
                    time.sleep(CLICK_PAUSE_SECONDS)
                except Exception as err:
                    # Best effort: keep scraping, but record a scalar placeholder
                    # (a list here would break the dataframe broadcast).
                    print("Could not open sub-category (%s); recording NaN" % err)
                    sub_val = "NaN"

            # Extracting skills. If that fails, record NaN
            try:
                skills = extract_skills()
            except Exception as err:
                print("Could not extract skills (%s); recording NaN" % err)
                skills = ["NaN"]

            if not frames:
                print("Made initial data")
            # Element text is read here, while the browser session is still alive.
            frames.append(make_skills_dataframe(val,
                                                sub_val,
                                                skills))
finally:
    # End Session. quit() shuts down the browser and the driver process, and
    # the finally block guarantees this happens even if the scrape fails.
    driver.quit()

# Combine everything into the main dataframe (original row indices are kept).
if frames:
    dt = pd.concat(frames)
else:
    dt = pd.DataFrame(columns=['Category', 'Subcategory', 'Skills'])

Made initial data


In [13]:
# add_table_to_db(dt,"skills_categories_table")

postgresql://Metaverse:***@localhost/freelance_db
Added data to freelance_db
