# Scraping Skills and Their Job Categories From Guru

In [1]:
# Standard Library
import os
import time
from getpass import getpass
from urllib.parse import quote_plus

# Scraping Libraries
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup

# Data Management
import pandas as pd
import numpy as np

# SQL Import
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2


### Defining Functions

In [2]:
def extract_category_paths():
    """
    Collect the first-level category elements used to descend into the skill tree.

    Reads the module-level ``driver`` and looks up the ``<label>`` elements of
    the first filter list under the ``search-app`` container.

    Returns
    -------
    list
        The matched elements with the leading "Any Category" entry removed.
        Empty when the lookup matches nothing.
    """
    # ``find_elements(By.XPATH, ...)`` is used instead of
    # ``find_elements_by_xpath``, which Selenium 4 no longer provides.
    categories = driver.find_elements(
        By.XPATH,
        '//*[@id="search-app"]/section/div/div[1]/div[2]/div[1]/div/div/div/div[2]/div/div[1]/div/div[2]/div/ul/li/label')
    # Slice instead of ``del categories[0]`` so an empty match does not raise
    # IndexError; the first element is the "Any Category" entry.
    return categories[1:]

In [3]:
def extract_subcategory_paths():
    """
    Collect the sub-category (level 2) elements for the currently selected category.

    Reads the module-level ``driver`` and looks up the ``<label>`` elements of
    the second filter list under the ``search-app`` container.

    Returns
    -------
    list
        The matched elements with the leading "Any Category" entry removed, or
        the placeholder list ``["NaN"]`` when no real sub-category is present.
    """
    # ``find_elements(By.XPATH, ...)`` is used instead of
    # ``find_elements_by_xpath``, which Selenium 4 no longer provides.
    subcategories = driver.find_elements(
        By.XPATH,
        '//*[@id="search-app"]/section/div/div[1]/div[2]/div[1]/div/div/div/div[2]/div/div[2]/div/div[2]/div/ul/li/label')

    # Zero matches (list not rendered) and one match (only "Any Category")
    # both mean there is no real sub-category; ``<= 1`` covers the empty case
    # that previously fell through to ``del subcategories[0]`` and raised.
    if len(subcategories) <= 1:
        return ["NaN"]

    # Drop the leading "Any Category" element.
    return subcategories[1:]

In [4]:
def extract_skills():
    """
    Collect the skill elements for the currently selected category/sub-category.

    Reads the module-level ``driver`` and looks up the ``<li>`` elements of the
    third filter list under the ``search-app`` container.

    Returns
    -------
    list
        The matched elements, or the placeholder list ``["NaN"]`` when nothing
        matches.
    """
    # ``find_elements(By.XPATH, ...)`` is used instead of
    # ``find_elements_by_xpath``, which Selenium 4 no longer provides.
    skills = driver.find_elements(
        By.XPATH,
        '//*[@id="search-app"]/section/div/div[1]/div[2]/div[1]/div/div/div/div[2]/div/div[3]/div/div[2]/div/ul/li')

    if not skills:
        return ["NaN"]

    return skills

In [5]:
def make_skills_dataframe(c,sc,sk):
    """
    Build a DataFrame with one row per skill for a category / sub-category pair.

    Parameters
    ----------
    c : object exposing ``get_attribute``
        Category element; its ``'innerText'`` becomes the ``Category`` value.
    sc : object exposing ``get_attribute``, or a placeholder
        Sub-category element. A placeholder such as ``"NaN"`` (optionally
        wrapped in a one-item list, ``["NaN"]``) is used as-is.
    sk : list
        Skill elements and/or placeholder values. A bare string is treated as
        a single skill.

    Returns
    -------
    pandas.DataFrame
        Columns ``Category``, ``Subcategory`` and ``Skills``; the scalar
        category and sub-category are repeated for every skill.
    """
    def _inner_text(item):
        # Placeholders (plain strings) have no ``get_attribute``; keep them
        # unchanged. ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        try:
            return item.get_attribute('innerText')
        except Exception:
            return item

    categories = c.get_attribute('innerText')

    # A one-item list placeholder must be unwrapped: left as a list it has
    # length 1 and pandas raises a length mismatch as soon as there is more
    # than one skill.
    if isinstance(sc, (list, tuple)) and len(sc) == 1:
        sc = sc[0]
    sub_categories = _inner_text(sc)

    # A bare string would otherwise be iterated character by character and
    # leave the frame with only scalar values, which pandas rejects.
    if isinstance(sk, str):
        sk = [sk]
    skills = [_inner_text(item) for item in sk]

    data = {'Category': categories, 'Subcategory': sub_categories, 'Skills': skills}
    data_frame = pd.DataFrame(data)
    return data_frame

In [6]:
def add_table_to_db(dataframe,table_name):
    """
    Write ``dataframe`` to the table ``table_name`` in a local PostgreSQL database.

    An existing table with the same name is replaced (``if_exists='replace'``).

    Connection settings come from environment variables so that no credentials
    are stored in the notebook:

    * ``FREELANCE_DB_NAME``     - database name (default ``'freelance_db'``)
    * ``FREELANCE_DB_USER``     - user name (default ``'Metaverse'``)
    * ``FREELANCE_DB_PASSWORD`` - password; prompted for with ``getpass`` when unset

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to store.
    table_name : str
        Name of the destination table.
    """
    dbname = os.environ.get('FREELANCE_DB_NAME', 'freelance_db')
    username = os.environ.get('FREELANCE_DB_USER', 'Metaverse')
    pswd = os.environ.get('FREELANCE_DB_PASSWORD')
    if pswd is None:
        # Never keep the password in source; ask for it interactively instead.
        pswd = getpass('Password for %s@localhost/%s: '%(username,dbname))

    # Connect to the database. User name and password are URL-escaped so
    # characters such as '@' or '/' do not corrupt the connection URL.
    engine = create_engine(
        'postgresql://%s:%s@localhost/%s'%(quote_plus(username),quote_plus(pswd),dbname))
    # Echo the target with the password redacted so it does not end up in the
    # saved cell output.
    print('postgresql://%s:***@localhost/%s'%(username,dbname))

    ## insert data into database from Python (proof of concept - this won't be useful for big data, of course)
    try:
        dataframe.to_sql(table_name, engine, if_exists='replace')
    finally:
        # Release pooled connections even when the write fails.
        engine.dispose()

    print("Added data to %s"%(dbname))

### Putting it all together

In [7]:
# Pause after each click so the dependent filter list can re-render before it
# is read.
CLICK_PAUSE_SECONDS = 0.5

driver = webdriver.Firefox()
frames = []  # one DataFrame per category / sub-category pair

try:
    driver.get("https://www.guru.com/d/jobs/")
    # ``find_element(By.XPATH, ...)`` is used instead of
    # ``find_element_by_xpath``, which Selenium 4 no longer provides.
    driver.find_element(
        By.XPATH,
        '/html/body/form/main/main/section/div/div[1]/div[2]/div[1]/div/button').click()

    categories = extract_category_paths()

    for val in categories:

        val.click()
        time.sleep(CLICK_PAUSE_SECONDS)
        sub_categories = extract_subcategory_paths()

        for sub_val in sub_categories:

            try:
                sub_val.click()
                time.sleep(CLICK_PAUSE_SECONDS)
            except Exception:
                # Either the "NaN" placeholder (a string has no ``click``) or
                # a click that failed. Use a scalar placeholder: a one-item
                # list would not broadcast against a longer skills list when
                # the DataFrame is built.
                sub_val = "NaN"

            try:
                skills = extract_skills()
            except Exception:
                skills = ["NaN"]

            if not frames:
                print("Made initial data")
            frames.append(make_skills_dataframe(val,
                                                sub_val,
                                                skills))
finally:
    # Always shut the browser down, even if scraping fails part-way through.
    driver.quit()

# Concatenate once at the end: ``DataFrame.append`` was removed in pandas 2.0
# and growing a frame inside the loop copies it on every iteration.
if frames:
    dt = pd.concat(frames)
else:
    dt = pd.DataFrame(columns=['Category', 'Subcategory', 'Skills'])

Made initial data


### Exporting to SQL Database

In [13]:
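# The export is disabled so that re-running the notebook does not overwrite the table.
# Uncomment the next line to write `dt` to the database.
# add_table_to_db(dt,"skills_categories_table")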

postgresql://Metaverse:***@localhost/freelance_db
Added data to freelance_db
